diff --git a/DEVELOPER_NOTES b/DEVELOPER_NOTES index 1db881a092..49eb67844c 100644 --- a/DEVELOPER_NOTES +++ b/DEVELOPER_NOTES @@ -1,3 +1,7 @@ +XX/XX/26 (TRUST) GPU : Blackwell B6000 and Cuda 13.0 build (NVHPC 25.11) supported +XX/XX/26 (TRUST) Matrix : Introduce Stencil structure to deal with possible nnz larger than 2^31 (possible on future GPU device) +XX/XX/26 (TRUST) VEF : Elem_VEF_base::normale replaced by Elem_VEF_base::creer_face_normales +XX/XX/26 (TRUST) Build : Add a new target profiling to build a -O3 -g (+specific options) binary to ease the use of profiling tools (perf, nsys, rocprof,...) ------------------------------------------------------------- Developer notes version 1.9.8_beta : Changes since version 1.9.7 : ------------------------------------------------------------- diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 7151a4deb7..331cb686e3 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,3 +1,10 @@ +XX/XX/26 (TRUST) GPU : AMG solver fixed for gfx1100 AMD card and ROCm 7.2 supported +XX/XX/26 (TRUST) Keyword : Add Correlation_triple keyword for computed advanced fields +XX/XX/26 (TRUST) Keyword : Add Enstrophie_totale keyword +XX/XX/26 (TRUST) Bug fix : Fix Corriger_frontiere_periodique_64 issue (VDF only) + Ecrire_med_64 not available +XX/XX/26 (TRUST) Change : Warning, VEF VerifierCoin keyword algorithm has been rewritten which may change the numbering of the mesh and the partitioned mesh +XX/XX/26 (TRUST) Solver : Sparskit based solver "solveur gen { solv_elem bicgstab|gmres ... }" removed. 
It can be replaced by more efficient PETSc equivalent solvers +XX/XX/26 (TRUST) Tool : trust -energy JOB_ID returns now on well-configured cluster the energy consumption of your job ------------------------------------------------------------------------------------------------- Release notes version 1.9.8_beta : Enhancements, modifications and corrected bugs since version 1.9.7 : -------------------------------------------------------------------------------------------------- @@ -7,7 +14,7 @@ Release notes version 1.9.8_beta : Enhancements, modifications and corrected bug 30/04/26 (TRUST) New feature : Introducing an IBM pre-processor + optimisations 27/04/26 (TRUST) New feature : Add new colocalised discretization that currently works with a compressible single-phase Euler or two-phase Baer-Nunziato problem integrating a Riemann solver (HLL & Rusanov). 08/04/26 (TRUST) Change : 'corriger_frontiere_periodique' becomes 'declarer_bord_perio'. Old keyword remains valid as a synonym. A periodic boundary must now **always** be declared with this keyword. The 'periodic' option in Partitioners is no longer necessary. -20/04/26 (TRUST) Change : Warning, VEF VerifierCoin keyword algorithm has been rewritten which possibly change the numerotation of the mesh and the partitioned mesh +20/04/26 (TRUST) Change : Warning, VEF VerifierCoin keyword algorithm has been rewritten which possibly change the numerotation of the mesh and the partitioned mesh 20/04/26 (TRUST) Solver : Sparskit based solver "solveur gen { solv_elem bicgstab|gmres ... }" removed. 
It can be replaced by more efficient PETSc equivalent solvers 20/04/26 (TRUST) Tool : trust -energy JOB_ID returns now on well-configured cluster the energy consumption of your job 14/04/26 (TRUST) New feature : CGNS supports now reset_time used in the framework of ICoCo @@ -18,8 +25,8 @@ Release notes version 1.9.8_beta : Enhancements, modifications and corrected bug 09/04/26 (TRUST) Major change : TrioCFD radiation models (transparent and semi-transparent media) are re-written completely so that they can be used in basic classes of TRUST. See examples and validations in TrioCFD code. For transparent medium, it is possible to define now more than one model in a coupled problem; ie: a model in each fluid problem. It is also possible to use it with a distant problem; ie: without a coupled problem. 31/03/26 (TRUST) Change : Change the diffusive time step computation in multiphase VDF. Less restrictive, similar as in PolyMAC now. 31/03/26 (TRUST) Change : Change the diffusive time step computation in multiphase VDF. Less restrictive, similar -as in PolyMAC now. -30/03/26 (TRUST) Performance : Computation overlapped by communication in VEF convective schemes. Convection operator duration reduced by 20% on 128 GPU case on Adastra. +as in PolyMAC now. +30/03/26 (TRUST) Performance : Computation overlapped by communication in VEF convective schemes. Convection operator duration reduced by 20% on 128 GPU case on Adastra. 12/03/26 (TRUST) New keyword : Analyse_angle_64 version of Analyse_angle keyword for large mesh to compute angles histogram especially for tetraedras to detect too much obtuse angles 24/02/26 (TRUST) GPU : Single GPU performance on MI250X (adastra, lumi) and MI300A (adastra) improved by 30% after ROCm update (6.4.x) 24/02/26 (TRUST) Bug fix : Replace CG by GMRES in AMG solver for better robustness and speed especially on GPU (may not converge) @@ -51,6 +58,14 @@ as in PolyMAC now. 
13/01/26 (TRUST) New feature : Add new flag 'adapt_dt_tmax' in time scheme that ensures that the simulation ends at tmax 08/01/26 (TRUST) Bug fix : Fix for the case where dt_post is specified once in the header of post-processing block and where a mix of post-processings from files and old fashion is requested 08/01/26 (TRUST) Bug fix : Possible memory increase when writing some files (typically opened/closed at each write) +07/01/26 (TRUST) New feature : New mechanics module (Meca) integrated, including linear elasticity (Hooke's law), a Newmark solver for elastodynamics, and thermo-elastic source terms. +07/01/26 (TRUST) New feature : Enable bidim_axi support in EF and PolyMAC_MPFA. +07/01/26 (TRUST) New feature : DP_impose & regular pressure drops: time-dependent driving, regul option, and save/restore; dp_regul parameter renamed from eps to alpha. +07/01/26 (TRUST) New feature : New source term Echange_Thermique_Volumique: implicit heat conduction between two domains in VDF/Poly* +07/01/26 (TRUST) New feature : New Champ_Morceaux (piecewise fields on sub-domains) and improvements to Champ_Fonc_Tabule_Morceaux (can use post-processed fields). +07/01/26 (TRUST) Fix : Postprocessing on deformable/ALE domains: fix weighted_sum/average (face control volumes) and optimize volume_maille. +07/01/26 (Tools) Fix : Improve macOS/darwin builds (PDI, darwin_g++). +07/01/26 (TRUST) Change : Decouper_multi now splits domains following the order specified in the input file. 
-------------------------------------------------------------------------------------------------- Release notes version 1.9.7 : Enhancements, modifications and corrected bugs since version 1.9.6 : -------------------------------------------------------------------------------------------------- diff --git a/ThirdPart/src/LIBAMGX/install b/ThirdPart/src/LIBAMGX/install index 028e669f62..c95dca46fe 100755 --- a/ThirdPart/src/LIBAMGX/install +++ b/ThirdPart/src/LIBAMGX/install @@ -51,7 +51,7 @@ then # Hack du CMakeLists.txt (vu sur orcus avec Cuda 12) car manque de cublasLt sed -i "1,$ s?CUDA::cublas?CUDA::cublas CUDA::cublasLt?g" ../CMakeLists.txt || exit -1 - if [ "$HOST" = jean-zay ] + if [ "$HOST" = jean-zay ] || [ $HOST = dalianvl ] then # ToDo: cublas not found on JeanZay so we simplify and it works now... Generalize to other builds ? cmake -DCMAKE_CUDA_COMPILER=$TRUST_NVCC -DAMGX_NO_RPATH=1 -DCMAKE_INSTALL_PREFIX=$AMGX_DIR/$LIB $AMGX_COMPILERS -DCMAKE_CUDA_FLAGS_RELEASE="-DNDEBUG" .. || exit -1 diff --git a/ThirdPart/src/LIBAMGXWRAPPER/install b/ThirdPart/src/LIBAMGXWRAPPER/install index 611d5eae22..a49d4e8ff6 100755 --- a/ThirdPart/src/LIBAMGXWRAPPER/install +++ b/ThirdPart/src/LIBAMGXWRAPPER/install @@ -67,6 +67,7 @@ then cd $AMGX_DIR/$LIB rm -r -f example && mkdir -p example && cd example tests="poisson solveFromFiles" + tests="" for test in $tests do echo "Building $test test..." 
diff --git a/ThirdPart/src/LIBCUDSS/install.sh b/ThirdPart/src/LIBCUDSS/install.sh index c1e3652490..1939513417 100755 --- a/ThirdPart/src/LIBCUDSS/install.sh +++ b/ThirdPart/src/LIBCUDSS/install.sh @@ -1,5 +1,7 @@ #!/bin/bash -build_dir=libcudss-linux-`uname -m`-0.6.0.5_cuda12-archive +version=0.7.1.4 +cuda=12 && [ "`nvcc --version 2>/dev/null | grep cuda_13`" != "" ] && cuda=13 +build_dir=libcudss-linux-`uname -m`-$version"_cuda"$cuda-archive mkdir -p $TRUST_ROOT/lib/src/LIBCUDSS tar -xf $TRUST_ROOT/externalpackages/cudss/$build_dir.tar.xz || exit -1 cp -r $build_dir/* $TRUST_ROOT/lib/src/LIBCUDSS diff --git a/ThirdPart/src/LIBKOKKOS/install_arborx.sh b/ThirdPart/src/LIBKOKKOS/install_arborx.sh index 97df6ccbae..73016db37b 100755 --- a/ThirdPart/src/LIBKOKKOS/install_arborx.sh +++ b/ThirdPart/src/LIBKOKKOS/install_arborx.sh @@ -1,7 +1,7 @@ #!/bin/bash [ "$TRUST_STDCPP" = c++14 ] && exit 0 [ "$TRUST_STDCPP" = c++17 ] && exit 0 -archive=$TRUST_ROOT/externalpackages/kokkos/arborx-2.0.1.tar.gz # C++ 20 +archive=$TRUST_ROOT/externalpackages/kokkos/arborx-2.1.tar.gz # C++ 20 build_dir=$TRUST_ROOT/build/arborx KOKKOS_ROOT_DIR=$TRUST_ROOT/lib/src/LIBKOKKOS diff --git a/ThirdPart/src/LIBKOKKOS/install_kokkos-kernels.sh b/ThirdPart/src/LIBKOKKOS/install_kokkos-kernels.sh index 37120ef684..82f6906946 100755 --- a/ThirdPart/src/LIBKOKKOS/install_kokkos-kernels.sh +++ b/ThirdPart/src/LIBKOKKOS/install_kokkos-kernels.sh @@ -3,7 +3,7 @@ [ "$TRUST_USE_GPU" != 1 ] && exit 0 # Kokkos-kernels: -archive=$TRUST_ROOT/externalpackages/kokkos/kokkos-kernels-release-candidate-5.1.0.tar.gz +archive=$TRUST_ROOT/externalpackages/kokkos/kokkos-kernels-5.1.1.tar.gz build_dir=$TRUST_ROOT/build/kokkos-kernels KOKKOS_ROOT_DIR=$TRUST_ROOT/lib/src/LIBKOKKOS # Log file of the process: diff --git a/ThirdPart/src/LIBKOKKOS/install_kokkos.sh b/ThirdPart/src/LIBKOKKOS/install_kokkos.sh index 696963c50d..b01d9908da 100755 --- a/ThirdPart/src/LIBKOKKOS/install_kokkos.sh +++ 
b/ThirdPart/src/LIBKOKKOS/install_kokkos.sh @@ -3,7 +3,7 @@ if [ "$TRUST_STDCPP" = c++20 ] then # Kokkos (C++20): - archive=$TRUST_ROOT/externalpackages/kokkos/kokkos-release-candidate-5.1.0.tar.gz + archive=$TRUST_ROOT/externalpackages/kokkos/kokkos-5.1.1.tar.gz elif [ "$TRUST_STDCPP" = c++17 ] then # Kokkos (C++17): diff --git a/ThirdPart/src/LIBKOKKOS/makefile b/ThirdPart/src/LIBKOKKOS/makefile index 28d6cfebc7..aca90cc518 100644 --- a/ThirdPart/src/LIBKOKKOS/makefile +++ b/ThirdPart/src/LIBKOKKOS/makefile @@ -1,8 +1,9 @@ # Kokkos -lib=$(TRUST_LIB)/src/LIBKOKKOS/$(TRUST_ARCH)$(OPT)/lib64/libkokkoscore.a +lib1=$(TRUST_LIB)/src/LIBKOKKOS/$(TRUST_ARCH)$(OPT)/lib64/libkokkoscore.a +lib2=$(TRUST_LIB)/src/LIBKOKKOS/$(TRUST_ARCH)$(OPT)/lib64/libkokkoskernels.a -all: $(lib) -$(lib): install_kokkos.sh install_kokkos-kernels.sh install_arborx.sh makefile +all: $(lib1) $(lib2) +$(lib1) $(lib2): install_kokkos.sh install_kokkos-kernels.sh install_arborx.sh makefile @make clean ./install_kokkos.sh && ./install_kokkos-kernels.sh && ./install_arborx.sh diff --git a/ThirdPart/src/LIBLAPACK/Installer b/ThirdPart/src/LIBLAPACK/Installer index e1bd5482fb..17a8b8a411 100755 --- a/ThirdPart/src/LIBLAPACK/Installer +++ b/ThirdPart/src/LIBLAPACK/Installer @@ -11,7 +11,7 @@ install_OpenBlas() rm -r -f *-OpenBLAS-* OPENBLAS_USE_OPENMP=$TRUST_USE_OPENMP # Disable OpenMP for PETSc (Probleme avec STRUMPACK sur GPU) - OPENBLAS_USE_OPENMP=0 + OPENBLAS_USE_OPENMP=0 gunzip -f -c $package | tar -xf - cd OpenBLAS-$version_openblas if [ "$TRUST_CC_BASE_EXTP" != "" ] @@ -39,7 +39,7 @@ install_OpenBlas() # CPU_ARCH="" #fi # Instructions -mavx512 fait crasher valgrind sur les machines avec instructions avx512 ... 
On desactive (comme dans TRUST d'ailleurs) - options="CC=$CC FC=$FC USE_THREAD=$OPENBLAS_USE_OPENMP USE_OPENMP=$OPENBLAS_USE_OPENMP NO_AVX512=1 $CPU_ARCH" + options="CC=$CC FC=$FC USE_THREAD=$OPENBLAS_USE_OPENMP USE_OPENMP=$OPENBLAS_USE_OPENMP NO_AVX512=1 BUILD_BFLOAT16=0 $CPU_ARCH" echo "Installation of $package ($options)..." if [ `uname -s` = Darwin ] then @@ -144,8 +144,7 @@ version_lapack=3.4.1 && [ "$TRUST_INT64" != "1" ] && [ "$TRUST_USE_MUMPS" != 1 ] # [HPC][!Portabilite] Utilisation OpenBlas par defaut (valide sur TRUST/F5/TrioCFD/G3) -> Decomposition LU plus rapide if [ "$TRUST_USE_OPENBLAS" = 1 ] then - #for tag in 0.3.25 0.3.29 - for tag in 0.3.29 + for tag in 0.3.33 do version_openblas=$tag echo "version_openblas = $tag" diff --git a/ThirdPart/src/LIBPETSC/amgx_int32.cxx b/ThirdPart/src/LIBPETSC/amgx_int32.cxx index 0cde5cd0ba..305bef9ac8 100644 --- a/ThirdPart/src/LIBPETSC/amgx_int32.cxx +++ b/ThirdPart/src/LIBPETSC/amgx_int32.cxx @@ -552,30 +552,30 @@ static PetscErrorCode PCView_AMGX(PC pc, PetscViewer viewer) } /*MC - PCAMGX - Interface to NVIDIA's AmgX algebraic multigrid - - Options Database Keys: -+ -pc_amgx_amg_method - set the AMG algorithm to use -. -pc_amgx_amg_cycle - set the AMG cycle type -. -pc_amgx_smoother - set the AMG pre/post smoother -. -pc_amgx_jacobi_relaxation_factor - set the relaxation factor for Jacobi smoothing -. -pc_amgx_gs_symmetric - enforce symmetric Gauss-Seidel smoothing (only applies if GS smoothing is selected) -. -pc_amgx_selector - set the AMG coarse selector -. -pc_amgx_presweeps - set the number of AMG pre-sweeps -. -pc_amgx_postsweeps - set the number of AMG post-sweeps -. -pc_amgx_max_levels - set the maximum number of levels in the AMG level hierarchy -. -pc_amgx_strength_threshold - set the strength threshold for the AMG coarsening -. -pc_amgx_aggressive_levels - set the number of levels (from the finest) that should apply aggressive coarsening -. -pc_amgx_coarse_solver - set the coarse solve -. 
-pc_amgx_print_grid_stats - output the AMG grid hierarchy to stdout -- -pc_amgx_verbose - enable AmgX output + PCAMGX - Interface to NVIDIA's AmgX algebraic multigrid + + Options Database Keys: ++ -pc_amgx_amg_method (CLASSICAL,AGGREGATION) - set the AMG algorithm to use +. -pc_amgx_amg_cycle (V,W,F,CG) - set the AMG cycle type +. -pc_amgx_jacobi_relaxation_factor - set the relaxation factor for Jacobi smoothing +. -pc_amgx_gs_symmetric - enforce symmetric Gauss-Seidel smoothing (only applies if GS smoothing is selected) +. -pc_amgx_selector (SIZE_2|SIZE_4|SIZE_8|MULTI_PAIRWISE|PMIS|HMIS) - set the AMG coarse selector +. -pc_amgx_presweeps - set the number of AMG pre-sweeps +. -pc_amgx_postsweeps - set the number of AMG post-sweeps +. -pc_amgx_max_levels - set the maximum number of levels in the AMG level hierarchy +. -pc_amgx_strength_threshold - set the strength threshold for the AMG coarsening +. -pc_amgx_aggressive_levels - set the number of levels (from the finest) that should apply aggressive coarsening +. -pc_amgx_coarse_solver (DENSE_LU_SOLVER,NOSOLVER) - set the coarse solve +. -pc_amgx_print_grid_stats - output the AMG grid hierarchy to `stdout` +. -pc_amgx_verbose - enable AmgX verbose output +- -pc_amgx_smoother (PCG|PCGF|PBICGSTAB|GMRES|FGMRES|JACOBI_L1|BLOCK_JACOBI|GS|MULTICOLOR_GS|MULTICOLOR_ILU|MULTICOLOR_DILU|CHEBYSHEV_POLY|NOSOLVER) - set the AMG pre/post smoother Level: intermediate Note: - Implementation will accept host or device pointers, but good performance will require that the `KSP` is also GPU accelerated so that data is not frequently transferred between host and device. + Implementation will accept host or device pointers, but good performance will require that the `KSP` is also GPU accelerated so that data is not frequently transferred between host and device. 
-.seealso: [](ch_ksp), `PCGAMG`, `PCHYPRE`, `PCMG`, `PCAmgXGetResources()`, `PCCreate()`, `PCSetType()`, `PCType` (for list of available types), `PC` +.seealso: [](ch_ksp), `PCGAMG`, `PCHYPRE`, `PCMG`, `PCAmgXGetResources()`, `PCCreate()`, `PCSetType()`, `PCType`, `PC` M*/ PETSC_EXTERN PetscErrorCode PCCreate_AMGX(PC pc) diff --git a/ThirdPart/src/LIBPETSC/amgx_int64.cxx b/ThirdPart/src/LIBPETSC/amgx_int64.cxx index 5587942db3..7b629ce4e1 100644 --- a/ThirdPart/src/LIBPETSC/amgx_int64.cxx +++ b/ThirdPart/src/LIBPETSC/amgx_int64.cxx @@ -556,30 +556,30 @@ static PetscErrorCode PCView_AMGX(PC pc, PetscViewer viewer) } /*MC - PCAMGX - Interface to NVIDIA's AmgX algebraic multigrid - - Options Database Keys: -+ -pc_amgx_amg_method - set the AMG algorithm to use -. -pc_amgx_amg_cycle - set the AMG cycle type -. -pc_amgx_smoother - set the AMG pre/post smoother -. -pc_amgx_jacobi_relaxation_factor - set the relaxation factor for Jacobi smoothing -. -pc_amgx_gs_symmetric - enforce symmetric Gauss-Seidel smoothing (only applies if GS smoothing is selected) -. -pc_amgx_selector - set the AMG coarse selector -. -pc_amgx_presweeps - set the number of AMG pre-sweeps -. -pc_amgx_postsweeps - set the number of AMG post-sweeps -. -pc_amgx_max_levels - set the maximum number of levels in the AMG level hierarchy -. -pc_amgx_strength_threshold - set the strength threshold for the AMG coarsening -. -pc_amgx_aggressive_levels - set the number of levels (from the finest) that should apply aggressive coarsening -. -pc_amgx_coarse_solver - set the coarse solve -. -pc_amgx_print_grid_stats - output the AMG grid hierarchy to stdout -- -pc_amgx_verbose - enable AmgX output + PCAMGX - Interface to NVIDIA's AmgX algebraic multigrid + + Options Database Keys: ++ -pc_amgx_amg_method (CLASSICAL,AGGREGATION) - set the AMG algorithm to use +. -pc_amgx_amg_cycle (V,W,F,CG) - set the AMG cycle type +. -pc_amgx_jacobi_relaxation_factor - set the relaxation factor for Jacobi smoothing +. 
-pc_amgx_gs_symmetric - enforce symmetric Gauss-Seidel smoothing (only applies if GS smoothing is selected) +. -pc_amgx_selector (SIZE_2|SIZE_4|SIZE_8|MULTI_PAIRWISE|PMIS|HMIS) - set the AMG coarse selector +. -pc_amgx_presweeps - set the number of AMG pre-sweeps +. -pc_amgx_postsweeps - set the number of AMG post-sweeps +. -pc_amgx_max_levels - set the maximum number of levels in the AMG level hierarchy +. -pc_amgx_strength_threshold - set the strength threshold for the AMG coarsening +. -pc_amgx_aggressive_levels - set the number of levels (from the finest) that should apply aggressive coarsening +. -pc_amgx_coarse_solver (DENSE_LU_SOLVER,NOSOLVER) - set the coarse solve +. -pc_amgx_print_grid_stats - output the AMG grid hierarchy to `stdout` +. -pc_amgx_verbose - enable AmgX verbose output +- -pc_amgx_smoother (PCG|PCGF|PBICGSTAB|GMRES|FGMRES|JACOBI_L1|BLOCK_JACOBI|GS|MULTICOLOR_GS|MULTICOLOR_ILU|MULTICOLOR_DILU|CHEBYSHEV_POLY|NOSOLVER) - set the AMG pre/post smoother Level: intermediate Note: - Implementation will accept host or device pointers, but good performance will require that the `KSP` is also GPU accelerated so that data is not frequently transferred between host and device. + Implementation will accept host or device pointers, but good performance will require that the `KSP` is also GPU accelerated so that data is not frequently transferred between host and device. 
-.seealso: [](ch_ksp), `PCGAMG`, `PCHYPRE`, `PCMG`, `PCAmgXGetResources()`, `PCCreate()`, `PCSetType()`, `PCType` (for list of available types), `PC` +.seealso: [](ch_ksp), `PCGAMG`, `PCHYPRE`, `PCMG`, `PCAmgXGetResources()`, `PCCreate()`, `PCSetType()`, `PCType`, `PC` M*/ PETSC_EXTERN PetscErrorCode PCCreate_AMGX(PC pc) diff --git a/ThirdPart/src/LIBPETSC/install b/ThirdPart/src/LIBPETSC/install index 345bec7c98..9d542d2d60 100755 --- a/ThirdPart/src/LIBPETSC/install +++ b/ThirdPart/src/LIBPETSC/install @@ -10,7 +10,7 @@ then cd - else # package=petsc-3.24.2.tar.gz # Huge issue with 3.24.x : make ctest_optim hangs - package=petsc-3.23.7.tar.gz && [ "$TRUST_USE_GPU" = 1 ] && package=petsc-5d4b16a5b.tar.gz # 2026_01_06 + package=petsc-3.23.7.tar.gz && [ "$TRUST_USE_GPU" = 1 ] && package=petsc-99a952e4.tar.gz # 2026_05_31 fi ###################################### @@ -290,10 +290,9 @@ with_gpu="" # Kokkos-Kernels # ################## # Toujours un pb de link en debug avec kokkos-kernels -# Et les performances de kokkos-kernels est moindre (x4 slowdown on A6000) par rapport aux kernels PETSc CUDA +# Et les performances de kokkos-kernels encore en retrait par rapport aux kernels PETSc CUDA # Mais utilise avec GAMG de PETSc cela permet d'avoir une alternative a AmgX (int64?) et Hypre -# Build error on HIP also and orcus: on active ponctuellement -ENABLE_KOKKOS=0 # && [ "$HOST" = topaze ] && ENABLE_KOKKOS=1 +ENABLE_KOKKOS=1 if [ "$ENABLE_KOKKOS" = 1 ] && [ -f $TRUST_ROOT/lib/src/LIBKOKKOS/$TRUST_ARCH"_opt"/lib64/libkokkoskernels.a ] then with_gpu=$with_gpu" --with-kokkos=1 --with-kokkos-dir=$TRUST_ROOT/lib/src/LIBKOKKOS/$TRUST_ARCH"_opt @@ -316,8 +315,11 @@ then ln -s -f $CUDA_ROOT/lib64/stubs/libcuda.so libcuda.so.1 export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH fi - # Hack for this sh.t : - [ ! -f /usr/lib64/libnvidia-ml.so.1 ] && cp -f $TRUST_ROOT/ThirdPart/src/LIBPETSC/libnvidia-ml.so.1 $TRUST_LIB/libnvidia-ml.so.1 + # Hack for this sh.t : PL ou cela pose probleme ? 
Sur orcus, pas sur la frontale donc stubs copie et plantage sur noeud de calcul... + #for file in /usr/lib64/libnvidia-ml.so.1 /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 $TRUST_ROOT/ThirdPart/src/LIBPETSC/libnvidia-ml.so.1 + #do + # [ -f $file ] && cp -f $file $TRUST_LIB && break + #done echo "CUDA support for PETSc with: "$with_gpu elif [ "$TRUST_USE_ROCM" = 1 ] then @@ -590,27 +592,38 @@ then ####### # Hypre ####### - add_package hypre 2.33.0 - # add_package hypre + # Le support de ROCm 7.x commence avec 3.1.0. Cuda 13 necessite aussi 3.1.0 + # hypre=2.33.0 && [ "`nvcc --version 2>&1 | grep cuda_13`" != "" ] && hypre=3.1.0 + # On passe a 3.1.0 car valide sur JeanZay H100 et Adastra MI300 + # On passe a master (89e7e8d) pour corriger un pb de build sur ROCm 7.2.x + hypre=master + add_package hypre $hypre if [ "$TRUST_USE_GPU" = 1 ] then - # Ne surtout pas activer UVM: lent sur AMD et CUDA : + # Ne surtout pas activer UVM: lent sur AMD et CUDA : #hypre_configure="--enable-unified-memory" - - # Enable MPI GPU awareness for Hypre (plantage encore sur adastra MI250 pendant KSPSolve: Memory access fault by GPU) - #[ "$TRUST_MPI_GPU_AWARE" = 1 ] && [ "$ROCM_ARCH" != gfx90a ] && hypre_configure=$hypre_configure" --enable-gpu-aware-mpi" - # PL: I disable --enable-gpu-aware-mpi cause issue on adastra (perf or crash) - sed -i "1,$ s?--enable-gpu-aware-mpi??g" config/BuildSystem/config/packages/hypre.py || exit -1 - + # Seems OK on Lumi (40% faster on 4xMI250X on OpenMP_Iterateur) + # 20% faster on 2xA100 (orcus) TaylorGreen_BENCH + # 50% faster on 8xMI300A (adastra) TaylorGreen_BENCH + # No gain on Topaze on 4xA100 CALCUL_83M ? + #if [ $TRUST_USE_ROCM = 1 ] + #then + # echo "WARNING: Disabling --enable-gpu-aware-mpi in Hypre on ROCm cause issues on adastra (perf or crash) ?" 
+ # sed -i "1,$ s?--enable-gpu-aware-mpi??g" config/BuildSystem/config/packages/hypre.py || exit -1 + #fi # Hack pour Hypre car --download-hypre-configure-arguments difficile a faire marcher avec plusieurs arguments... sed -i "1,$ s?--with-MPI-libs?$hypre_configure --with-MPI-libs?g" config/BuildSystem/config/packages/hypre.py || exit -1 - + #nedit config/BuildSystem/config/packages/hypre.py [ "$TRUST_USE_CUDA" = 1 ] && GPU_ARCH=$TRUST_CUDA_CC [ "$TRUST_USE_ROCM" = 1 ] && GPU_ARCH=$ROCM_ARCH with_packages=$with_packages" --with-hypre-gpu-arch=$GPU_ARCH" - #add_package Umpire # Recommended for performance See later some issue when linking with Hypre + # Recommended for performance during Hypre setup according PETSc: + # Setup on 8xMI300A 15s->13s but +30% RAM ! + # Mandatory for HIP with --enable-gpu-aware-mpi cause OpenMP_Iterateur/weak_scaling.sh crash on device on 256xMI250X + # Disabled for CUDA cause hangs on orcus a100 + [ "$TRUST_USE_ROCM" = 1 ] && add_package Umpire && LIBS="--LIBS=-lrt" # Fix on old Fedora: Undefined shm_open fi fi @@ -701,7 +714,7 @@ do echo "Configuring PETSc..." 
# Hack provisoire sur aarch64, python de conda fait planter le configure: [ "`uname -m`" = aarch64 ] && PATH=/usr/bin:$PATH - LIBS="" && [ `uname -s` = Darwin ] && LIBS=--LIBS=`$TRUST_Awk '/SYSLIBS =/ {gsub("SYSLIBS =","",$0);print $0}' $fic_env` + [ `uname -s` = Darwin ] && LIBS=--LIBS=`$TRUST_Awk '/SYSLIBS =/ {gsub("SYSLIBS =","",$0);print $0}' $fic_env` export TMPDIR=$TRUST_TMP # Par defaut, les fichiers temporaires de PETSC sont sous /tmp, cela peut probleme si pas de droit d'execution donnes ./configure --help 1>../configure.help 2>&1 cp ../configure.help $PETSC_ROOT/ diff --git a/Validation/Rapports_automatiques/Verification/Verification_codage/Champ_front_MED/src/pre_run b/Validation/Rapports_automatiques/Verification/Verification_codage/Champ_front_MED/src/pre_run index 1a37b51bfc..d017c52b45 100644 --- a/Validation/Rapports_automatiques/Verification/Verification_codage/Champ_front_MED/src/pre_run +++ b/Validation/Rapports_automatiques/Verification/Verification_codage/Champ_front_MED/src/pre_run @@ -8,5 +8,6 @@ source $TRUST_MEDCOUPLING_ROOT/env.sh python ./prepare.py || exit -1 # Dump du field.med cree: +export LD_LIBRARY_PATH=$TRUST_ROOT/lib/src/LIBHDF5/lib:$LD_LIBRARY_PATH echo -e "1\n1\n0\n" | $TRUST_ROOT/lib/src/LIBMED/bin/mdump --structure fields.med diff --git a/bin/KSH/Createcmakefile.py b/bin/KSH/Createcmakefile.py index a4fe09d131..34f800114d 100644 --- a/bin/KSH/Createcmakefile.py +++ b/bin/KSH/Createcmakefile.py @@ -359,6 +359,9 @@ def generate_cmake_files(root_dir, atelier): if (lib STREQUAL "nvidia-ml") set (lib${lib} /usr/lib64/libnvidia-ml.so.1) # PC, some clusters + if(NOT EXISTS ${lib${lib}}) + set (lib${lib} /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1) # PC Ubuntu + endif() if(NOT EXISTS ${lib${lib}}) # Other clusters, cmake can't find this sh.t set (lib${lib} ${TRUST_ROOT}/lib/libnvidia-ml.so.1) endif() diff --git a/bin/KSH/debog.sh b/bin/KSH/debog.sh index c0d82260d9..7c3df673db 100755 --- a/bin/KSH/debog.sh +++ b/bin/KSH/debog.sh @@ 
-4,8 +4,11 @@ jdd=`pwd` jdd=`basename $jdd` cp $jdd.data cpu.data cp $jdd.data gpu.data -sed -i "1,$ s?Solve?Debog pb seq faces 1.e-6 0 Solve?g" cpu.data -sed -i "1,$ s?Solve?Debog pb seq faces 1.e-6 1 Solve?g" gpu.data -(source $cpu/env_TRUST.sh;$exec cpu 2>&1 | tee cpu.out_err) -$exec gpu 2>&1 | tee gpu.out_err -compare_lata cpu.lml gpu.lml +pb=`awk '/Solve/ && (NF==2) {print $2}' $jdd.data` +seuil=1.e-12 # seuil relatif mais sert aussi comme seuil absolu pour filtrer +sed -i "1,$ s?Solve?Debog $pb seq faces $seuil 1 Solve?g" cpu.data || exit -1 +sed -i "1,$ s?Solve?Debog $pb seq faces $seuil 0 Solve?g" gpu.data || exit -1 +rm -f *lml +TRUST_CLOCK_ON=1 $exec gpu 2>&1 | tee gpu.out_err +(env=`ls $cpu/env_*sh`;source $env;$exec cpu 2>&1 | tee cpu.out_err) +compare_lata cpu.lml gpu.lml --max_delta diff --git a/bin/gestion_externalpackages/md5.ref b/bin/gestion_externalpackages/md5.ref index fa522032be..89865c560a 100644 --- a/bin/gestion_externalpackages/md5.ref +++ b/bin/gestion_externalpackages/md5.ref @@ -259,21 +259,23 @@ b8042f9970ea70a36da1ee1fae27c448 VisIt/mesa-17.3.9.tar.xz 85adef240c5f370b308da8c938951a68 VisIt/zlib-1.2.11.tar.xz ba84eaa8564155babd4ba1458d4eaa11 astyle_2.03_linux.tar.gz d43a8fbe83767978098ba7f8ee25d3d1 ccache/ccache-3.1.4.tar.gz +134a43d5c8e01c7805b1adc2dc7e7048 ccache/ccache-4.13.6-linux-aarch64-musl-static.tar.gz 27fc515a919221d69008cf7347137752 ccache/ccache-4.8.2-darwin.tar.gz 34991901e77027afcc3bb16a9595c353 ccache/ccache-4.8.2-linux-x86_64.tar.gz -34fd4b0843da02ebaa76f5711e1b63de cudss/libcudss-linux-aarch64-0.6.0.5_cuda12-archive.tar.xz -4ac17f5b35a4ecc550c4d7c479a5c5b5 cudss/libcudss-linux-x86_64-0.6.0.5_cuda12-archive.tar.xz +e81b58209814379f5d1476705229602f cudss/libcudss-linux-aarch64-0.7.1.4_cuda13-archive.tar.xz +97a40c68c2f4d4d0405532c65ede87b7 cudss/libcudss-linux-x86_64-0.7.1.4_cuda12-archive.tar.xz +38cfe9a97f3d8e8060d99ba34bdd8d3b cudss/libcudss-linux-x86_64-0.7.1.4_cuda13-archive.tar.xz 2cf02a542c1933de95bdbe3f42188ffa 
doxygen-1.7.4.linux.bin.tar.gz 065cef54eb09cdb54614e1ed353ddbd1 doxygen-1.9.3.src.tar.gz 27c5022f697e2522c0dbab439b9573b9 gnuplot/gnuplot-5.2.7.tar.gz ea0931758fc180e3b1950931b9869921 gnuplot/gnuplot-6.0.2.tar.gz 4fa24da17c99b122a56cb8808b6eb78b hwloc-2.7.1.tar.gz -a51b0a245c34151f42a4c6120d2aceec kokkos/arborx-2.0.1.tar.gz +58e992e49dcb3a100e6b7dbd75a6689b kokkos/arborx-2.1.tar.gz 36abe803480d07db87f9ba03cd5a842c kokkos/kokkos-3.7.02.tgz 24cd603e2a047fc8d67d814f33769f54 kokkos/kokkos-4.7.00.tar.gz -7ab1e3728978c5be85d77485eea96aa1 kokkos/kokkos-kernels-release-candidate-5.1.0.tar.gz -6a1c520d8aa7147fb4067ee7745e2575 kokkos/kokkos-release-candidate-5.1.0.tar.gz -853a0c5c0747c5943e7ef4bbb793162d lapack/OpenBLAS-0.3.29.tar.gz +1d68ff32eaea69cb97726fb6b6354b7c kokkos/kokkos-5.1.1.tar.gz +3a7125c6e47f19dc3d75712b24293aaa kokkos/kokkos-kernels-5.1.1.tar.gz +96c5cd9013013faefc294bc57830c77d lapack/OpenBLAS-0.3.33.tar.gz 44c3869c38c8335c2b9c2a8bb276eb55 lapack/lapack-3.4.1.tgz b5e558f981326d9ca1bfdb841640721a make-4.0.tar.gz 9be6e048224797bf531f94b7a8aaa99d osqp-0.6.0.tar.gz @@ -286,12 +288,13 @@ d4c0862c48e6e9742807e6e50bdf5deb petsc/STRUMPACK_8.0.0.tar.gz 8048f7b7b50daa99257593cf2e7d785a petsc/ScaLAPACK-6f56981cb0cabffd8c72c7d1016146c4b8e276dc.tar.gz e659373ed5e9b961d2fcb6d67d250783 petsc/SuiteSparse-7.7.0.tar.gz f8559a94ee64c8b70ebd79b65576d08d petsc/SuperLU_DIST-9.1.0.tar.gz -da990c4e944ede86879fd29ee309d8c4 petsc/Umpire-2025.09.0.tar.gz +a684171841395b8903963e3970aae567 petsc/Umpire-2025.12.0.tar.gz d4990384b7b1d8b0357fc34d91530d49 petsc/hypre-2.33.0.tar.gz c33d67d1ae475460002782b09929e5cf petsc/hypre-3.1.0.tar.gz +870443b8d6d173469e91af547e6bcb5a petsc/hypre-master.tar.gz 88a40e3bf9e8ee28af8725a73f9e3bc3 petsc/petsc-3.23.7.tar.gz d4c79d4859cd6770439e7a4d880777de petsc/petsc-3.24.2.tar.gz -6a53e417a59d7cbde483babb624703f7 petsc/petsc-5d4b16a5b.tar.gz +bb46da11f4af1c212cd0bdcaef48fa91 petsc/petsc-99a952e4.tar.gz f121c9d7ef5e43a20899acd93f425b22 
petsc/slate-v2023.06.00.tgz 5390282424f874836d572e2ae4e3c185 petsc/zfp-1.0.1.tar.gz e97ed4ddf3b59a05729097ab66a46b03 pip/mpi4py-4.0.0.tar.gz diff --git a/bin/lance_test b/bin/lance_test index b559b50010..aba68fa885 100755 --- a/bin/lance_test +++ b/bin/lance_test @@ -1011,7 +1011,7 @@ then liste_skipped=$liste_skipped" $i" i="" # Discard Pb_multiphase # - elif [ "`grep -i "Pb_multiphase " $file 2>/dev/null`" != "" ] + elif [ "`grep -i "Pb_multiphase" $file 2>/dev/null`" != "" ] then echo "Test $i skipped (Pb_multiphase not yet supported on GPU)" liste_skipped=$liste_skipped" $i" diff --git a/bin/mklibs b/bin/mklibs index 27d802beca..8671767134 100755 --- a/bin/mklibs +++ b/bin/mklibs @@ -50,7 +50,7 @@ then [ -f libcmumps.a ] && PETSC_L=$PETSC_L" cmumps dmumps mumps_common smumps zmumps pord" [ -f libpastix.a ] && PETSC_L=$PETSC_L" pastix" [ -f libml.a ] && PETSC_L=$PETSC_L" ml" - for lib in HYPRE strumpack magma sbutterflypack dbutterflypack cbutterflypack zbutterflypack zfp slate blaspp lapackpp scalapack blacs spai parms parmetis metis ptesmumps ptscotcherr ptscotcherrexit ptscotchparmetisv3 ptscotch scotch scotcherr scotcherrexit + for lib in HYPRE umpire camp strumpack magma sbutterflypack dbutterflypack cbutterflypack zbutterflypack zfp slate blaspp lapackpp scalapack blacs spai parms parmetis metis ptesmumps ptscotcherr ptscotcherrexit ptscotchparmetisv3 ptscotch scotch scotcherr scotcherrexit do [ -f lib$lib.a ] && PETSC_L=$PETSC_L" $lib" done diff --git a/bin/trust b/bin/trust index 3fae8615f4..aac19d669a 100755 --- a/bin/trust +++ b/bin/trust @@ -144,6 +144,7 @@ help() echo "-perf : Run perf tool (profiling)." echo "-trace : Run traceanalyzer tool (MPI profiling)." 
[ "`rocprof --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-rocprof : Run rocprof tool (GPU profiling on AMD)" + [ "`rocprof --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-rcu : Run rocprof compute (GPU profiling on AMD)" [ "`nsys --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-nsys : Run Nsight system tool (GPU profiling)." [ "`ncu --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-ncu kernel : Run Nsight compute tool on given kernel (GPU Kernel profiling)." [ "`compute-sanitizer --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-cs memcheck|racecheck|initcheck|synccheck : Run compute sanitizer tool (GPU debugging)." @@ -455,6 +456,9 @@ do elif [ "$1" = "-rocprof" ] then ROCPROF=1 + elif [ "$1" = "-rcu" ] + then + RCU=1 elif [ "$1" = "-nsys" ] then NSYS=1 @@ -880,9 +884,17 @@ then rm -f *.pftrace *.csv #See https://dci.dci-gitlab.cines.fr/webextranet/software_stack/tools/index.html#adastra-software-stack-tools-profiling-rocprof #export AMD_SERIALIZE_COPY=3 AMD_SERIALIZE_KERNEL=3 GPU_MAX_HW_QUEUES=1 - #profiler="rocprofv3 -o $NOM --output-format=pftrace --hip-trace --" - #exec="\"rocprofv3 -o $NOM --output-format=pftrace --hip-trace --hip-runtime-trace -- $exec\"" - exec="\"rocprofv3 -o $NOM --output-format=pftrace --memory-allocation-trace --hip-trace --kernel-trace --kokkos-trace -- $exec\"" + # --runtime-trace : Collects tracing data for HIP runtime API, marker (ROCTx) API, RCCL API, memory operations (copies, scratch, and allocation), and kernel dispatches. + # --kokkos-trace: Enables builtin Kokkos tools support, which implies enabling --marker-trace collection and --kernel-rename. 
+ # --scratch-memory-trace: detect scratch mem alloc + # --stats + exec="\"rocprofv3 -o $NOM --output-format=pftrace --memory-copy-trace --memory-allocation-trace --hip-trace --kernel-trace --kokkos-trace -- $exec\"" +elif [ "$RCU" = 1 ] +then + # Trace roofline + rm -f *.pftrace *.csv + echo "pmc: TCC_EA_RDREQ_32B_sum TCC_EA_RDREQ_sum TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum SQ_INSTS_VALU_ADD_F64 SQ_INSTS_VALU_MUL_F64 SQ_INSTS_VALU_FMA_F64 SQ_INSTS_VALU_TRANS_F64 SQ_INSTS_VALU_MFMA_MOPS_F64" > counters.txt + exec="\"rocprofv3 --input=counters.txt -- $exec\"" elif [ "$NSYS" = 1 ] then rm -f $NOM.qdrep @@ -903,7 +915,7 @@ then then sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid' # metriques GPU en plus mais necessite d'etre sudo - trace=$trace" --gpu-metrics-device all" + trace=$trace" --gpu-metrics-devices all" # profiler="sudo " # Mais attention fichiers ecrits root ! fi fi @@ -1201,17 +1213,6 @@ then then [ "`ldd $binary 2>/dev/null | grep gtl_hsa`" = "" ] && echo "Error, binary $exec not linked correctly for MPICH_GPU_SUPPORT_ENABLED=1 !" && exit -1 echo "export MPICH_GPU_SUPPORT_ENABLED=1" >> $GPU_DIRECT - # More variables for adastra to fiabilize GPU communications tips from adastra support: - # Move into HOST files ? - if [ $HOST = adastra ] || [ $HOST = lumi ] - then - echo "# Some flags to enable if issues MI250/MI300: -export MPICH_ASYNC_PROGRESS=1 -export FI_CXI_RX_MATCH_MODE=software -export FI_CXI_REQ_BUF_SIZE=12582912 -export FI_CXI_REQ_BUF_MIN_POSTED=8 -export FI_CXI_DEFAULT_CQ_SIZE=393216" >> $GPU_DIRECT - fi fi PETSC_OPTIONS=$PETSC_OPTIONS" -use_gpu_aware_mpi 1" elif [ "$TRUST_USE_GPU" = 1 ] @@ -1236,6 +1237,7 @@ export FI_CXI_DEFAULT_CQ_SIZE=393216" >> $GPU_DIRECT # et blocage possible sur castor avec Baltik par exemple !!! 
OUTPUT=.$NOM fi + rm -f $OUTPUT.err $OUTPUT.out if [ $NB_PROCS = 1 ] && [ $USE_MPIRUN = 0 ] then echo "$profiler \$exec \$case $PETSC_OPTIONS 1>$OUTPUT.out 2>$OUTPUT.err" >> $sub_file @@ -1326,10 +1328,13 @@ export FI_CXI_DEFAULT_CQ_SIZE=393216" >> $GPU_DIRECT # A cause de platine (permission denied), on utilise plutot les chemins pointes /proc/self/fd/1 et 2 [ -f $OUTPUT.out ] && cat $OUTPUT.out >> "/proc/self/fd/1" #cat $OUTPUT.out > "/dev/stdout" [ -f $OUTPUT.err ] && cat $OUTPUT.err >> "/proc/self/fd/2" #cat $OUTPUT.err > "/dev/stderr" - # Check for TRUST calculation only (cause coupled MC2 calculation DO NOT produce this message for example): - if [ "`grep 'Executable: ' $OUTPUT.err 2>/dev/null`" != "" ] && [ "`grep 'Arret des processes.' $OUTPUT.err`" = "" ] + if [ ! -f $OUTPUT.err ] then + echo "Error of submission !" err=1 + else + # Check for TRUST calculation only (cause coupled MC2 calculation DO NOT produce this message for example): + [ "`grep 'Executable: ' $OUTPUT.err 2>/dev/null`" != "" ] && [ "`grep 'Arret des processes.' $OUTPUT.err`" = "" ] && err=1 fi # Try to detect crashes (if it is not a TRUST binary, example PETSc test case) if [ "`grep 'invalid device function' $OUTPUT.err`" != "" ] || [ "`grep 'Signal: Aborted' $OUTPUT.err`" != "" ] @@ -1337,7 +1342,7 @@ export FI_CXI_DEFAULT_CQ_SIZE=393216" >> $GPU_DIRECT err=1 fi [ $sub = CCC ] && ccc_myproject -P $queue 2>/dev/null # Sur CCRT heures de calcul - [ "$TRUST_USE_SACCT" = 1 ] && trust -energy $id | tee -a $NOM.TU # Energy consumption printed and added to .TU + [ "$err" != 1 ] && [ "$TRUST_USE_SACCT" = 1 ] && trust -energy $id | tee -a $NOM.TU # Energy consumption printed and added to .TU else # Pas de gestionnaire, example PC: [ "$cpus_per_task" != "" ] && [ "$HOST" != "jean-zay" ] && echo "Number of core per task option not supported yet on $HOST. 
Contact TRUST support" && exit -1 @@ -1432,10 +1437,10 @@ then elif [ "$ROCPROF" = 1 ] then echo "Use -g flag for source stack" - echo "Open $NOM"_results".fptrace file with: firefox https://ui.perfetto.dev" + echo "Open $NOM"_results".pftrace file with: firefox https://ui.perfetto.dev" elif [ "$NSYS" = 1 ] then - [ -f $NOM.nsys-rep ] && nsys-ui $NOM.nsys-rep + [ -f $NOM.nsys-rep ] && nsys-ui $NOM.nsys-rep && rm -f $NOM.nsys-rep $NOM.*.html $NOM.sqlite elif [ "$NCU" = 1 ] then # Marche pas encore: diff --git a/env_src/HOST.env b/env_src/HOST.env index bc979b8d90..7a1baa330f 100755 --- a/env_src/HOST.env +++ b/env_src/HOST.env @@ -21,10 +21,14 @@ elif [ "${HOST#gutta}" != $HOST ];then HOST=aar elif [ "${HOST#grenx}" != $HOST ];then HOST=aar elif [ "${HOST#aar}" != $HOST ];then HOST=aar elif [ "${HOST#summer}" != $HOST ];then HOST=summer -elif [ "${HOST#calypso}" != $HOST ] +elif [ "${HOST#calypso}" != $HOST ] # GH100: then - [ "${HOST#calypso-grace}" = $HOST ] && echo "Error, you need to log on grace node with: salloc -p grace --gres=gpu:0" && exit + [ "${HOST#calypso-grace}" = $HOST ] && echo "Error, you need to log and build on grace node (ARM) with: salloc -p grace --gres=gpu:0" HOST=calypso-grace +elif [ "${HOST#dalia}" != $HOST ] # GB200: Intel sur la frontale, incompatible avec ARM sur le noeud de calcul: +then + [ "${HOST#dalianvl}" = $HOST ] && echo "Error, you need to log and build (configure && make) on grace node (ARM) with: srun -p defq -t 240 --exclusive -c 144 --gres=gpu:0 --pty bash" + HOST=dalianvl elif [ "${HOST#mezel}" != $HOST ] then HOST=mezel diff --git a/env_src/HOST_adastra.sh b/env_src/HOST_adastra.sh index 9c90cfb203..f13a351db3 100755 --- a/env_src/HOST_adastra.sh +++ b/env_src/HOST_adastra.sh @@ -9,7 +9,8 @@ define_modules_config() echo "Command qstat created on $HOST" cp $TRUST_ROOT/bin/KSH/qstat_wrapper $TRUST_ROOT/bin/KSH/qstat # Initialisation de l environnement module $MODULE_PATH - echo "source /etc/profile" >> $env + # echo "source 
/etc/profile" >> $env # Slow.... + echo "source /etc/bashrc" >> $env # # Load modules if [ "$TRUST_USE_ROCM" = 1 ] @@ -23,6 +24,7 @@ define_modules_config() echo "$ROCM_ARCH not supported on adastra!" fi # Compilateur hipcc + module="PrgEnv-gnu/8.6.0 craype-accel-amd-$ROCM_ARCH rocm/6.4.3 gcc/11.2" # KO module="PrgEnv-gnu/8.6.0 craype-accel-amd-$ROCM_ARCH rocm/6.4.3" module=$module" firefox" # For profiling else @@ -36,7 +38,7 @@ define_modules_config() # PL: C++20 module="craype-x86-trento craype-network-ofi PrgEnv-gnu/8.5.0 libfabric" # gcc 13.X fi - module=$module" python/3.12.1 swig" # Pour -without-conda + module=$module" python/3.12.1 cmake/3.27.9 swig" # Pour -without-conda # echo "# Module $module detected and loaded on $HOST." echo "module purge 1>/dev/null" >> $env @@ -67,6 +69,7 @@ define_soumission_batch() then project="genden15" fi + rm -f ld_env.sh if [ "$gpu" = 1 ] then if [ "$ROCM_ARCH" = gfx90a ] # Partition MI250X (BW: 1600 GB/s) @@ -83,7 +86,7 @@ define_soumission_batch() gpu_per_node=4 # Not available on the GPU nodes: #cp -f /lib64/libsuitesparseconfig.so.4 . - #echo "export LD_LIBRARY_PATH=.:\$LD_LIBRARY_PATH" > ld_env.sh + #echo "export LD_LIBRARY_PATH=.:\$LD_LIBRARY_PATH" >> ld_env.sh #echo "export TRUST_DISABLE_CHECK_OS=1" >> ld_env.sh fi noeuds=`echo "1+($NB_PROCS-1)/$gpu_per_node" | bc` @@ -100,6 +103,13 @@ define_soumission_batch() srun_options="" #[ $NB_PROCS -gt ??? ] && qos=??? fi + # More variables for adastra to fiabilize GPU communications tips from adastra support: + echo "# Some flags to enable if issues MI250/MI300: +export MPICH_ASYNC_PROGRESS=1 +export FI_CXI_RX_MATCH_MODE=software +export FI_CXI_REQ_BUF_SIZE=12582912 +export FI_CXI_REQ_BUF_MIN_POSTED=8 +export FI_CXI_DEFAULT_CQ_SIZE=393216" >> ld_env.sh node=1 # --exclusive ram=0 # Important pour acceder a toute la RAM du noeud # ToDo utiliser le binding !!! 
@@ -107,11 +117,14 @@ define_soumission_batch() # https://dci.dci-gitlab.cines.fr/webextranet/porting_optimization/detailed_binding_script.html#adastra-detailed-binding-script # Attention, le verbose est important sinon crash ! voir doc USE_MPIRUN=1 # Pour profiter du binding meme en sequentiel - if [ "$TRUST_USE_OLD_BINDING" = 1 ] || [ "$ROCM_ARCH" = gfx942 ] # Pas clair encore le binding sur MI300 + if [ "$ROCM_ARCH" = gfx942 ] # Pas clair encore le binding sur MI300 then mpirun="srun -l $srun_options --mpi=cray_shasta --mem-bind=local --cpu-bind=verbose,cores" - else + elif [ "$ROCM_ARCH" = gfx90a ] # MI250 + then mpirun="srun -l $srun_options --mem-bind=none --cpu-bind=verbose,none -- \$TRUST_ROOT/env_src/adastra_acc_binding.sh" + else + mpirun="srun -l $srun_options --mem-bind=none --cpu-bind=verbose,none -- " fi sub=SLURM } diff --git a/env_src/HOST_calypso.sh b/env_src/HOST_calypso.sh deleted file mode 100755 index 856b92f4fc..0000000000 --- a/env_src/HOST_calypso.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -################################## -# Variables for configure script # -################################## -define_modules_config() -{ - env=$TRUST_ROOT/env/machine.env - # Initialisation de l environnement module $MODULE_PATH - #echo "source /etc/profile.d/modules.sh " >> $env - # Load modules - module="python/3.12.10 tools/cmake/3.28.2_arm nvhpc/24.1" # openmpi/4.1.7_gcc114_cuda124 gcc/12.3.0_arm - [ "$TRUST_CUDA_CC" = "" ] && TRUST_CUDA_CC=90 # H100 - echo "# Module $module detected and loaded on $HOST." - echo "module purge 1>/dev/null" >> $env - echo "module load $module" >> $env - echo "[ \$? != 0 ] && echo \"Error: $module not found; we exit...\" && echo \"Contat TRUST support team or system administrator\" && exit -1" >> $env - echo $source >> $env - . 
$env - # Creation wrapper qstat -> squeue - echo "#!/bin/bash -squeue" > $TRUST_ROOT/bin/qstat - chmod +x $TRUST_ROOT/bin/qstat -} - -############################## -# Variables for trust script # -############################## -define_soumission_batch() -{ - soumission=2 - [ "$prod" = 1 ] && soumission=1 - [ "$gpu" = 1 ] && soumission=1 - queue=grace && gpus_per_node=`echo $NB_PROCS | awk '{print $1<1?$1:1}'` && noeuds=`echo "1+($NB_PROCS-1)/1" | bc` # 1GPU/node - if [ "$prod" = 1 ] || [ "$NB_PROCS" -gt 40 ] - then - qos=2jours && cpu=2880 - [ "$gpu" != 1 ] && node=1 # exclusif uniquement sur cpu - else - qos=test && cpu=60 && node=0 - fi - # Le binding ameliore fortement les performances sur AMD quelque soit MPI: - if [ "$I_MPI_ROOT" != "" ] # IntelMPI - then - binding="-m block:block --cpu-bind=rank" - mpirun="srun $binding -n \$SLURM_NTASKS" - elif [ "$HPCX_DIR" != "" ] # HPC-X - then - binding="--map-by numa --bind-to core" - mpirun="mpirun $binding -n \$SLURM_NTASKS" - else - mpirun="srun --mpi=pmix -n \$SLURM_NTASKS" - fi - sub=SLURM -} diff --git a/env_src/HOST_dalianvl.sh b/env_src/HOST_dalianvl.sh new file mode 100755 index 0000000000..23c2b489bc --- /dev/null +++ b/env_src/HOST_dalianvl.sh @@ -0,0 +1,53 @@ +#!/bin/bash +####################################################################### +# Dalia # +####################################################################### + + +################################## +# Variables for configure script # +################################## +define_modules_config() +{ + env=$TRUST_ROOT/env/machine.env + # Initialisation de l environnement module $MODULE_PATH si pas disponible: + module -v 1>/dev/null 2>&1 || echo $echo "source /etc/profile" >> $env + # + # Load modules (do not take gcc/14.2.0) + module="gmp/6.3.0 mpfr/4.2.1 mpc/1.3.1 slurm/slurm/24.11 nvhpc/25.9" + echo "# Module $module detected and loaded on $HOST." 
+ echo "module purge 1>/dev/null" >> $env + echo "module load $module 1>/dev/null || exit -1" >> $env + echo "export TRUST_USE_SACCT=1" >> $env # Energy data + echo "export USE_NVHPC_MPI=1" >> $env + #echo "export TRUST_BATCH=\"srun -p defq -n 144 -A pdl17744\"" >> $env + . $env + # Creation wrapper qstat -> squeue + echo "#!/bin/bash +squeue" > $TRUST_ROOT/bin/qstat + chmod +x $TRUST_ROOT/bin/qstat +} + +############################## +# Variables for trust script # +############################## +define_soumission_batch() +{ + soumission=1 + # http://www.idris.fr/docs/dalia/dalia-environnement + project="pdl17744" + ntasks=144 # number of cores max + gpus_per_node=`echo $NB_PROCS | awk '{print $1<4?$1:4}'` + noeuds=`echo "1+($NB_PROCS-1)/4" | bc` + cpus_per_task=`echo $ntasks/$gpus_per_node | bc` + noeuds=`echo "1+($NB_PROCS-1)/4" | bc` + queue=defq + cpu=60 # 2880 # 2 days + #os=qos_gpu$q-t3 && cpu=1200 && [ "$prod" != 1 ] && [ $NB_PROCS -le 32 ] && qos=qos_gpu$q-dev && cpu=120 + #hintnomultithread=1 + #node=1 # --exclusive + USE_MPIRUN=1 + mpirun="mpiexec -n \$SLURM_NTASKS --mca pml ucx" # Command for NVHPC MPI UCX + sub=SLURM +} + diff --git a/env_src/HOST_lumi.sh b/env_src/HOST_lumi.sh index 24025d2447..0e590ea525 100755 --- a/env_src/HOST_lumi.sh +++ b/env_src/HOST_lumi.sh @@ -21,7 +21,8 @@ define_modules_config() else echo "$ROCM_ARCH not supported on lumi!" fi - module="PrgEnv-gnu/8.5.0 craype-accel-amd-$ROCM_ARCH LUMI/24.03 partition/G buildtools/24.03 CrayEnv rocm/6.4.4 gnuplot/5.4.10-cpeGNU-24.03" + LUMI=25.03 # Default 2026_05_15 + module="PrgEnv-gnu/8.5.0 craype-accel-amd-$ROCM_ARCH LUMI/$LUMI partition/G buildtools/$LUMI CrayEnv rocm/6.4.4" else echo "Not configured." 
&& exit -1 fi @@ -52,7 +53,7 @@ define_soumission_batch() [ $NB_PROCS -le 128 ] && queue=dev-g && qos="" && cpu=60 # 1h [ $NB_PROCS -le 64 ] && queue=dev-g && qos="" && cpu=120 # h2 fi - project=project_465002428 + project=project_465002986 if [ "$gpu" = 1 ] then if [ "$ROCM_ARCH" = gfx90a ] # Partition MI250X (BW: 1600 GB/s) @@ -68,6 +69,13 @@ define_soumission_batch() else echo "ToDo" fi + # More variables for adastra to fiabilize GPU communications tips from adastra support: + echo "# Some flags to enable if issues MI250/MI300: +export MPICH_ASYNC_PROGRESS=1 +export FI_CXI_RX_MATCH_MODE=software +export FI_CXI_REQ_BUF_SIZE=12582912 +export FI_CXI_REQ_BUF_MIN_POSTED=8 +export FI_CXI_DEFAULT_CQ_SIZE=393216" >> ld_env.sh node=1 # --exclusive ram=480g # RAM per node (512 - 32) USE_MPIRUN=1 diff --git a/env_src/HOST_orcus-intel.sh b/env_src/HOST_orcus-intel.sh index 9ecd8c39fd..e340386a8c 100755 --- a/env_src/HOST_orcus-intel.sh +++ b/env_src/HOST_orcus-intel.sh @@ -42,6 +42,7 @@ squeue" > $TRUST_ROOT/bin/qstat ############################## define_soumission_batch() { + [ "$TRUST_USE_CUDA" = 1 ] && echo "GPU run can't be launched on the Intel orcus frontale. Use orcus AMD frontale." 
&& exit -1 soumission=2 [ "$prod" = 1 ] && soumission=1 [ "$gpu" = 1 ] && soumission=1 diff --git a/env_src/configurer_env b/env_src/configurer_env index d161e54143..b9b243fa5f 100755 --- a/env_src/configurer_env +++ b/env_src/configurer_env @@ -685,9 +685,9 @@ case $arch in CUDA_COMPILER=nvcc TRUST_CC_BASE="nvcc_wrapper" fi - # Si n'est pas dans le PATH, on download NVHPC: - $CUDA_COMPILER --version 1>/dev/null 2>&1 || INSTALL_NVHPC=1 - if [ "$INSTALL_NVHPC" = 1 ] + # Si n'est pas dans le PATH ou version ancienne, on download NVHPC: + NVCC_OK=`nvcc --version 2>/dev/null | awk '/release/ {gsub(",","",$5); print ($5>12.1)}'` + if [ "$NVCC_OK" != 1 ] then source ../env_src/gpu/install_nvhpc_sdk_toolkit.sh echo "ADD_PATH $NVHPC/bin" >>$env @@ -786,6 +786,11 @@ TRUST_NB_PROCS=`./configurer_env -TRUST_NB_PROCS` TRUST_MAKE="make -j $TRUST_NB_PROCS" if [ -f /proc/cpuinfo ] then + if [ "`uname -m`" = aarch64 ] + then + total_cores=`grep processor /proc/cpuinfo | wc -l` + TRUST_NB_PROCS=$total_cores + else # Nombre de processeurs physiques: procs=`grep "physical id" /proc/cpuinfo | sort -u | wc -l` && [ "$procs" = 0 ] && procs=1 cores_per_proc=`grep "core id" /proc/cpuinfo | sort -u | wc -l` && [ "$cores_per_proc" = 0 ] && cores_per_proc=1 @@ -793,6 +798,7 @@ then echo "# Detected $procs processors of $cores_per_proc cores means a total of $total_cores physical cores." | tee -a $env CACHE_SIZE=`awk '/cache size/ {if ($(NF-1)>cs) cs=$(NF-1)} END {print cs}' /proc/cpuinfo | sort -u` echo "# Detected a size cache of $CACHE_SIZE KB." | tee -a $env + fi elif [ `uname -s` = "Darwin" ]; then total_cores=$(sysctl -n hw.perflevel0.logicalcpu) fi @@ -1291,13 +1297,14 @@ then then echo "NVidia driver version: $NVIDIA_VERSION" NVCC_VALIDE=`echo $NVCC_VERSION $NVIDIA_VERSION | awk '{if (1.0*$1<=1.0*$2) print 1}'` - if [ "$NVCC_VALIDE" = "" ] - then - echo "The NVidia drivers are too old for your Cuda compiler." 
- echo "You will experience possible error code:222, reason: the provided PTX was compiled with an unsupported toolchain" - echo "-> Update NVidia driver or take older Cuda compiler." + # Ok si meme Cuda 12.x ou Cuda 13.x + #if [ "$NVCC_VALIDE" = "" ] + #then + # echo "The NVidia drivers are too old for your Cuda compiler." + # echo "You will experience possible error code:222, reason: the provided PTX was compiled with an unsupported toolchain" + # echo "-> Update NVidia driver or take older Cuda compiler." #exit -1 - fi + #fi fi if [ "$TRUST_CUDA_CC" = "" ] then @@ -1366,8 +1373,6 @@ then echo "ADD_PATH \$NVHPC_ROOT/$MPI/bin" >> $env echo "ADD_LD_LIBRARY_PATH \$NVHPC_ROOT/$MPI/lib" >> $env fi - # nvidia-ml for PETSc (energy measure): - [ -f /usr/lib64/libnvidia-ml.so ] && echo "ADD_LD_LIBRARY_PATH /usr/lib64" >> $env fi m="# TRUST will use CUDA ?";e="TRUST_USE_CUDA=\"$TRUST_USE_CUDA\" && export TRUST_USE_CUDA";ecrit $m"|"$e"|"$env @@ -1946,7 +1951,8 @@ case $TRUST_ARCH_CC in [ "`echo $TRUST_VERSION_GNU | awk -F. 
'{print ($1==12)}'`" = 1 ] && CppFlags=$CppFlags" -Wno-use-after-free" # Uniquement si gcc12 (Erreur Kokkos sinon) elif [ $TRUST_ARCH_CC = linux_nvcc_wrapper ] then - CppFlags=$CppFlags" -arch=sm_$TRUST_CUDA_CC --extended-lambda -Werror" + # 20011/20014 : calling a __host__ function() from a __host__ __device__ function() is not allowed + CppFlags=$CppFlags" -arch=sm_$TRUST_CUDA_CC --extended-lambda -Werror -Xcudafe --diag_suppress=20011 -Xcudafe --diag_suppress=20014" fi CppFlags=$CppFlags" -fno-common -Wno-long-long -Wall -Wno-unknown-pragmas -Wnon-virtual-dtor -Wreorder -Woverloaded-virtual -Wsynth -Wextra -Wno-unused-parameter -pedantic -fabi-version=0 -Wno-cpp" CppFlags=$CppFlags" -fno-math-errno" # Operations mathematiques optimisees sans ecarts crees (vient de F5) diff --git a/env_src/gpu/install_nvhpc_sdk_toolkit.sh b/env_src/gpu/install_nvhpc_sdk_toolkit.sh index 86d6d83af7..a0a8296a16 100644 --- a/env_src/gpu/install_nvhpc_sdk_toolkit.sh +++ b/env_src/gpu/install_nvhpc_sdk_toolkit.sh @@ -1,12 +1,17 @@ #!/bin/bash -# NVIDIA HPC SDK -NVIDIA_VERSION=`nvidia-smi 2>/dev/null | awk '/CUDA Version/ {v=$(NF-1);gsub("\\\.","",v);print v}'` # Cuda12.9 works on Driver 12.x. Issue for major version only. E.g: Cuda13.x on Cuda12.x -# Keep 23.5 if issue on orcus or jean-zay... 
-# SDK_VERSION=23.5 && CUDA_VERSION=12.1 && installer=nvhpc_2023_235_Linux_x86_64_cuda_$CUDA_VERSION && installer_md5sum=eff38d63c4d08ca5c2962dad049a6833 -# Support Blackwell: -SDK_VERSION=25.5 && CUDA_VERSION=12.9 && installer=nvhpc_2025_255_Linux_x86_64_cuda_$CUDA_VERSION && installer_md5sum=748302adcb483bc332214a34dad1e31d +TRUST_CUDA_VERSION=`nvidia-smi 2>/dev/null | awk '/CUDA Version/ {print $(NF-1)}'` +if [ "$TRUST_CUDA_VERSION" = 13.0 ] +then + # Debut du support Cuda 13 dans TRUST (ex: B6000) + # On limite 25.11 a 13.0 pour le moment car sur A6000 avec 13.2, crashes bizarre avec SEGFAULT ou deivision pas 0, 25.5 OK) + SDK_VERSION=25.11 && CUDA_VERSION=13.0 && installer=nvhpc_2025_2511_Linux_x86_64_cuda_$CUDA_VERSION && installer_md5sum=2601954ba94355aea67d32043aaa3263 + # Domaine_VF.cpp plante sur ArborX avec: + # SDK_VERSION=26.3 && CUDA_VERSION=13.1 && installer=nvhpc_2026_263_Linux_x86_64_cuda_$CUDA_VERSION && installer_md5sum=712c670a876409f96608f7f29bdbcf51 +else + SDK_VERSION=25.5 && CUDA_VERSION=12.9 && installer=nvhpc_2025_255_Linux_x86_64_cuda_$CUDA_VERSION && installer_md5sum=748302adcb483bc332214a34dad1e31d +fi INSTALL=$TRUST_ROOT/env/gpu/install NVHPC=$INSTALL/nvhpc-$SDK_VERSION/Linux_x86_64/$SDK_VERSION/compilers diff --git a/src/DG/Operateurs/Op_Diff_Dift/Op_Diff_DG_Elem.cpp b/src/DG/Operateurs/Op_Diff_Dift/Op_Diff_DG_Elem.cpp index a5a7a11c2d..3d8e21e858 100644 --- a/src/DG/Operateurs/Op_Diff_Dift/Op_Diff_DG_Elem.cpp +++ b/src/DG/Operateurs/Op_Diff_Dift/Op_Diff_DG_Elem.cpp @@ -107,7 +107,7 @@ void Op_Diff_DG_Elem::dimensionner(Matrice_Morse& la_matrice) const // TODO a re int size_inc = indices_glob_elem(nb_elem_tot); const Stencil& stencil_sorted = domaine.get_stencil_sorted(); - const int nb_stencil_max = stencil_sorted.dimension(1); + const auto nb_stencil_max = stencil_sorted.dimension(1); la_matrice.dimensionner(size_inc, size_inc, 0); @@ -137,7 +137,7 @@ void Op_Diff_DG_Elem::dimensionner(Matrice_Morse& la_matrice) const // TODO a re 
for (int nelem = 0; nelem < nb_elem_tot; nelem++) { auto row = tab1[indices_glob_elem(nelem)]-1 ; - auto nb_indices_line = tab1[indices_glob_elem(nelem)+1] - tab1[indices_glob_elem(nelem)]; + int nb_indices_line = (int)(tab1[indices_glob_elem(nelem)+1] - tab1[indices_glob_elem(nelem)]); indice = 0; for (int d = 0; d < dim; d++) { diff --git a/src/DG/Operateurs/Op_Divers/Op_Div_DG.cpp b/src/DG/Operateurs/Op_Divers/Op_Div_DG.cpp index 012850a7b6..ac9006327b 100644 --- a/src/DG/Operateurs/Op_Divers/Op_Div_DG.cpp +++ b/src/DG/Operateurs/Op_Divers/Op_Div_DG.cpp @@ -101,7 +101,7 @@ void Op_Div_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl) int size_col = indices_glob_elem_v(nb_elem_tot); const Stencil& stencil_sorted = domaine.get_stencil_sorted(); - const int nb_stencil_max = stencil_sorted.dimension(1); + const auto nb_stencil_max = stencil_sorted.dimension(1); int nb_indices_line; int row, col, indice; @@ -132,8 +132,8 @@ void Op_Div_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl) for (int nelem = 0; nelem < nb_elem_tot; nelem++) { - row = tabv1[indices_glob_elem_p(nelem)] - 1; - nb_indices_line = tabv1[indices_glob_elem_p(nelem) + 1] - tabv1[indices_glob_elem_p(nelem)]; + row = (int)(tabv1[indices_glob_elem_p(nelem)] - 1); + nb_indices_line = (int)(tabv1[indices_glob_elem_p(nelem) + 1] - tabv1[indices_glob_elem_p(nelem)]); indice = 0; for (int i = 0; i < nb_bfunc_p; i++) @@ -187,11 +187,11 @@ void Op_Div_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl) for (int nelem = 0; nelem < nb_elem_tot; nelem++) { - nb_indices_line = tabp1(indices_glob_elem_p(nelem) + 1) - tabp1(indices_glob_elem_p(nelem)); + nb_indices_line = (int)(tabp1(indices_glob_elem_p(nelem) + 1) - tabp1(indices_glob_elem_p(nelem))); for (int i = 0; i < nb_bfunc_p; i++) { - row = tabp1(indices_glob_elem_p(nelem) + i) - 1; + row = (int)(tabp1(indices_glob_elem_p(nelem) + i) - 1); for (int k = 0; k < nb_stencil_max; k++) { diff --git 
a/src/DG/Operateurs/Op_Divers/Op_Grad_DG.cpp b/src/DG/Operateurs/Op_Divers/Op_Grad_DG.cpp index c881783504..b462730a97 100644 --- a/src/DG/Operateurs/Op_Divers/Op_Grad_DG.cpp +++ b/src/DG/Operateurs/Op_Divers/Op_Grad_DG.cpp @@ -93,7 +93,7 @@ void Op_Grad_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl coeff = 0; const Stencil& stencil_sorted = domaine.get_stencil_sorted(); - const int nb_stencil_max = stencil_sorted.dimension(1); + const auto nb_stencil_max = stencil_sorted.dimension(1); int nb_indices_line; int row, col, indice; @@ -116,8 +116,8 @@ void Op_Grad_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl for (int nelem = 0; nelem < nb_elem_tot; nelem++) { - row = tab1[indices_glob_elem_v(nelem)] - 1; - nb_indices_line = tab1[indices_glob_elem_v(nelem) + 1] - tab1[indices_glob_elem_v(nelem)]; + row = (int)(tab1[indices_glob_elem_v(nelem)] - 1); + nb_indices_line = (int)(tab1[indices_glob_elem_v(nelem) + 1] - tab1[indices_glob_elem_v(nelem)]); indice = 0; for (int i = 0; i < nb_bfunc_v*dim; i++) diff --git a/src/DG/Solveurs/Assembleur_P_DG.cpp b/src/DG/Solveurs/Assembleur_P_DG.cpp index 2cfb3c10e8..e2da8a1d57 100644 --- a/src/DG/Solveurs/Assembleur_P_DG.cpp +++ b/src/DG/Solveurs/Assembleur_P_DG.cpp @@ -121,7 +121,7 @@ int Assembleur_P_DG::assembler_mat(Matrice& la_matrice, const DoubleVect& diag, int size_inc = indices_glob_elem(nb_elem_tot); const Stencil& stencil_sorted = domaine.get_stencil_sorted(); - const int nb_stencil_max = stencil_sorted.dimension(1); + const auto nb_stencil_max = stencil_sorted.dimension(1); mat.dimensionner(size_inc, size_inc, 0); auto& tab1 = mat.get_set_tab1(); @@ -148,8 +148,8 @@ int Assembleur_P_DG::assembler_mat(Matrice& la_matrice, const DoubleVect& diag, for (int nelem = 0; nelem < nb_elem_tot; nelem++) { - row = tab1[indices_glob_elem(nelem)] - 1; - nb_indices_line = tab1[indices_glob_elem(nelem) + 1] - tab1[indices_glob_elem(nelem)]; + row = (int)(tab1[indices_glob_elem(nelem)] - 1); + 
nb_indices_line = (int)(tab1[indices_glob_elem(nelem) + 1] - tab1[indices_glob_elem(nelem)]); indice = 0; for (int k = 0; k < nb_stencil_max; k++) { diff --git a/src/EF/Champs/Champ_Fonc_Tabule_P0_EF.cpp b/src/EF/Champs/Champ_Fonc_Tabule_P0_EF.cpp index 578ad7f0bd..6c3741e1cb 100644 --- a/src/EF/Champs/Champ_Fonc_Tabule_P0_EF.cpp +++ b/src/EF/Champs/Champ_Fonc_Tabule_P0_EF.cpp @@ -31,6 +31,7 @@ void Champ_Fonc_Tabule_P0_EF::associer_param(const VECT(OBS_PTR(Champ_base)) &le void Champ_Fonc_Tabule_P0_EF::mettre_a_jour(double t) { + // ToDo: replace by Champ_Fonc_P0_base::mettre_a_jour(t, la_table.valeur(), les_ch_param); DoubleTab& mes_valeurs = valeurs(); int nb_elem = le_dom_VF->nb_elem(), nb_elem_tot = le_dom_VF->nb_elem_tot(), nb_param = les_ch_param.size(); DoubleTabs val_params_aux_elems; diff --git a/src/Kernel/Champs/Champ_Gen_de_Champs_Gen.cpp b/src/Kernel/Champs/Champ_Gen_de_Champs_Gen.cpp index 965c00a014..7dd2e5e6a3 100644 --- a/src/Kernel/Champs/Champ_Gen_de_Champs_Gen.cpp +++ b/src/Kernel/Champs/Champ_Gen_de_Champs_Gen.cpp @@ -127,10 +127,6 @@ OWN_PTR(Champ_Fonc_base)& Champ_Gen_de_Champs_Gen::creer_espace_stockage(const N const int nb_comp, OWN_PTR(Champ_Fonc_base)& es_tmp) const { - if (es_tmp) - { - ToDo_Kokkos("critical, call to creer_espace_stockage() is expensive on GPU (fields copy on host). Refactor like Champ_Generique_Moyenne and other advanced fields..."); - } Noms noms; Noms unites; for (int c=0; c0) // Once the computation is initialized + { + ToDo_Kokkos("critical, recurent calls to creer_espace_stockage() is expensive on GPU (fields copy on host). 
Refactor like Champ_Generique_Moyenne and other advanced fields..."); + } const Discretisation_base& discr = get_discretisation(); Motcle directive = get_directive_pour_discr(); const Domaine_dis_base& domaine_dis = get_ref_domaine_dis_base(); diff --git a/src/Kernel/Champs/Champ_Generique_Correlation.cpp b/src/Kernel/Champs/Champ_Generique_Correlation.cpp index 1b21f25078..c157fa3862 100644 --- a/src/Kernel/Champs/Champ_Generique_Correlation.cpp +++ b/src/Kernel/Champs/Champ_Generique_Correlation.cpp @@ -120,7 +120,7 @@ const Champ_base& Champ_Generique_Correlation::get_champ(OWN_PTR(Champ_base)& es else espace_stockage_->changer_temps(temps()); DoubleTab& tab_correlation = espace_stockage_->valeurs(); - tab_correlation = Op_Correlation_.calculer_valeurs(); + Op_Correlation_.calculer(tab_correlation); tab_correlation.echange_espace_virtuel(); return espace_stockage_; } diff --git a/src/Kernel/Champs/Champ_Generique_Correlation_Triple.cpp b/src/Kernel/Champs/Champ_Generique_Correlation_Triple.cpp new file mode 100644 index 0000000000..604fedae00 --- /dev/null +++ b/src/Kernel/Champs/Champ_Generique_Correlation_Triple.cpp @@ -0,0 +1,142 @@ +/**************************************************************************** +* Copyright (c) 2026, CEA +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*****************************************************************************/ + +#include +#include +#include +#include +#include + +Implemente_instanciable(Champ_Generique_Correlation_Triple, + "Champ_Post_Statistiques_Correlation_Triple|Correlation_Triple", + Champ_Generique_Statistiques_base); +// XD correlation_triple champ_post_statistiques_base correlation_triple -1 to calculate the triple correlation between three fields. 
+ +Sortie& Champ_Generique_Correlation_Triple::printOn(Sortie& s) const +{ + return s << que_suis_je() << " " << le_nom(); +} + +Entree& Champ_Generique_Correlation_Triple::readOn(Entree& s) +{ + return Champ_Generique_Statistiques_base::readOn(s); +} + +void Champ_Generique_Correlation_Triple::completer(const Postraitement_base& post) +{ + Champ_Gen_de_Champs_Gen::completer(post); + const Probleme_base& Pb = get_ref_pb_base(); + + const Champ_Generique_base& ch1 = get_source(0); + const Champ_Generique_base& ch2 = get_source(1); + const Champ_Generique_base& ch3 = get_source(2); + const Domaine_dis_base& zdis = get_ref_domaine_dis_base(); + + // Tout est gere en interne : pas besoin de chercher des champs dans le postraitement + Op_Correlation_Triple_.associer(zdis, ch1, ch2, ch3, tstat_deb_, tstat_fin_); + + Nom prefix = Pb.le_nom() + "_"; + if (post.le_nom() != "??" && post.le_nom() != "neant") + prefix += post.le_nom() + "_"; + if (parent_name_ != "??" && !use_source_name_only_) + prefix += parent_name_ + "_"; + Op_Correlation_Triple_.completer(Pb, prefix); +} + +const Champ_base& Champ_Generique_Correlation_Triple::get_champ_without_evaluation(OWN_PTR(Champ_base)& espace_stockage) const +{ + // nb_comp_post() retourne nb_comp_abc_ (sans les colonnes auxiliaires du tableau etendu) + const int nb_comp = Op_Correlation_Triple_.nb_comp_post(); + Nature_du_champ nature_source = (nb_comp == 1) ? scalaire : vectoriel; + OWN_PTR(Champ_Fonc_base) es_tmp; + espace_stockage = creer_espace_stockage(nature_source, nb_comp, es_tmp); + return espace_stockage; +} + +const Champ_base& Champ_Generique_Correlation_Triple::get_champ(OWN_PTR(Champ_base)& espace_stockage) const +{ + // nb_comp_post() retourne nb_comp_abc_ (sans les colonnes auxiliaires du tableau etendu) + const int nb_comp = Op_Correlation_Triple_.nb_comp_post(); + Nature_du_champ nature_source = (nb_comp == 1) ? 
scalaire : vectoriel; + if (!espace_stockage_) + creer_espace_stockage(nature_source, nb_comp, espace_stockage_); + else + espace_stockage_->changer_temps(temps()); + DoubleTab& tab = espace_stockage_->valeurs(); + // fill_result ecrit directement dans tab (qui porte deja le bon md_vector P0) + // sans creer de tableau temporaire, evitant l'assertion md_vector_ == v.md_vector_ + // qui se declencherait si on faisait tab = calculer_valeurs() avec un resultat + // sans md_vector. + Op_Correlation_Triple_.fill_result(tab); + tab.echange_espace_virtuel(); + return espace_stockage_; +} + +const Noms Champ_Generique_Correlation_Triple::get_property(const Motcle& query) const +{ + Motcles motcles(2); + motcles[0] = "unites"; + motcles[1] = "composantes"; + switch (motcles.search(query)) + { + case 0: + { + // Ne retourner que les nb_comp_abc_ premieres unites (pas les colonnes auxiliaires) + const int nb = Op_Correlation_Triple_.nb_comp_post(); + const Noms& all = integrale().le_champ_calcule().unites(); + if (all.size() <= nb) return all; + Noms res(nb); + for (int i = 0; i < nb; i++) res[i] = all[i]; + return res; + } + case 1: + { + // Ne retourner que les nb_comp_abc_ premiers noms de composantes + const int nb = Op_Correlation_Triple_.nb_comp_post(); + const Noms& all = integrale().le_champ_calcule().noms_compo(); + if (all.size() <= nb) return all; + Noms res(nb); + for (int i = 0; i < nb; i++) res[i] = all[i]; + return res; + } + } + return Champ_Gen_de_Champs_Gen::get_property(query); +} + +void Champ_Generique_Correlation_Triple::nommer_source() +{ + if (nom_post_ == "??") + { + Nom n("Correlation_Triple_"); + n += get_source(0).get_property("nom")[0]; + n += "_"; + n += get_source(1).get_property("nom")[0]; + n += "_"; + n += get_source(2).get_property("nom")[0]; + nommer(n); + } +} + +int Champ_Generique_Correlation_Triple::get_info_type_post() const +{ + return (get_property("composantes").size() > 1) ? 
1 : 0; +} + +const Motcle Champ_Generique_Correlation_Triple::get_directive_pour_discr() const +{ + if (Op_Correlation_Triple_.integrale().get_support_different()) + return Motcle("champ_elem"); + return Op_Correlation_Triple_.le_champ_a()->get_directive_pour_discr(); +} diff --git a/src/Kernel/Champs/Champ_Generique_Correlation_Triple.h b/src/Kernel/Champs/Champ_Generique_Correlation_Triple.h new file mode 100644 index 0000000000..4c830bb444 --- /dev/null +++ b/src/Kernel/Champs/Champ_Generique_Correlation_Triple.h @@ -0,0 +1,77 @@ +/**************************************************************************** +* Copyright (c) 2026, CEA +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*****************************************************************************/ + +#ifndef Champ_Generique_Correlation_Triple_included +#define Champ_Generique_Correlation_Triple_included + +#include +#include +#include + +class Postraitement_base; + +/*! @brief class Champ_Generique_Correlation_Triple + * + * Champ destine a post-traiter une correlation triple . + * Les trois champs Moyenne correspondants doivent etre declares avant ce champ dans le jeu de donnees. + * + * Syntaxe : + * nom correlation_triple { t_deb val t_fin val sources_reference { F , G , H } } + * + */ +class Champ_Generique_Correlation_Triple : public Champ_Generique_Statistiques_base +{ + Declare_instanciable(Champ_Generique_Correlation_Triple); + +public: + + const Noms get_property(const Motcle& query) const override; + + inline double temps() const override + { + return Op_Correlation_Triple_.integrale().le_champ_calcule().temps(); + }; + inline const Integrale_tps_Champ& integrale() const override + { + return Op_Correlation_Triple_.integrale(); + }; + + inline const Operateur_Statistique_tps_base& Operateur_Statistique() const override; + inline Operateur_Statistique_tps_base& Operateur_Statistique() override; + void completer(const Postraitement_base& post) override; + + const Motcle get_directive_pour_discr() const override; + const Champ_base& get_champ_without_evaluation(OWN_PTR(Champ_base)& espace_stockage) const override; + const Champ_base& 
get_champ(OWN_PTR(Champ_base)& espace_stockage) const override; + void nommer_source() override; + int get_info_type_post() const override; + +protected: + Op_Correlation_Triple Op_Correlation_Triple_; + +private: + mutable OWN_PTR(Champ_Fonc_base) espace_stockage_; +}; + +inline const Operateur_Statistique_tps_base& Champ_Generique_Correlation_Triple::Operateur_Statistique() const +{ + return Op_Correlation_Triple_; +} +inline Operateur_Statistique_tps_base& Champ_Generique_Correlation_Triple::Operateur_Statistique() +{ + return Op_Correlation_Triple_; +} + +#endif diff --git a/src/Kernel/Champs/Champ_Generique_Ecart_Type.cpp b/src/Kernel/Champs/Champ_Generique_Ecart_Type.cpp index e5c6805663..1646a003dc 100644 --- a/src/Kernel/Champs/Champ_Generique_Ecart_Type.cpp +++ b/src/Kernel/Champs/Champ_Generique_Ecart_Type.cpp @@ -107,7 +107,7 @@ const Champ_base& Champ_Generique_Ecart_Type::get_champ(OWN_PTR(Champ_base)&) co else espace_stockage_->changer_temps(temps()); DoubleTab& tab_ecart_type = espace_stockage_->valeurs(); - tab_ecart_type = Op_Ecart_Type_.calculer_valeurs(); + Op_Ecart_Type_.calculer(tab_ecart_type); tab_ecart_type.echange_espace_virtuel(); return espace_stockage_; } diff --git a/src/Kernel/Champs/Champ_Generique_Moyenne.cpp b/src/Kernel/Champs/Champ_Generique_Moyenne.cpp index 2cf4ada943..80af70a040 100644 --- a/src/Kernel/Champs/Champ_Generique_Moyenne.cpp +++ b/src/Kernel/Champs/Champ_Generique_Moyenne.cpp @@ -93,11 +93,15 @@ const Champ_base& Champ_Generique_Moyenne::get_champ(OWN_PTR(Champ_base)&) const Nature_du_champ nature_source = source.nature_du_champ(); int nb_comp = source.nb_comp(); if (!espace_stockage_) - creer_espace_stockage(nature_source,nb_comp,espace_stockage_); + { + creer_espace_stockage(nature_source, nb_comp, espace_stockage_); + //mapToDevice(espace_stockage_->valeurs()); // Force creation on device + // PL: normally A=B (line 103) if B is on device should create A on device no ? 
+ } else espace_stockage_->changer_temps(temps()); DoubleTab& tab_moy = espace_stockage_->valeurs(); - tab_moy = Op_Moyenne_.calculer_valeurs(); + Op_Moyenne_.calculer(tab_moy); tab_moy.echange_espace_virtuel(); return espace_stockage_; } diff --git a/src/Kernel/Champs/Champ_Generique_Predefini.cpp b/src/Kernel/Champs/Champ_Generique_Predefini.cpp index 70a30d0c2d..4787ffda2f 100644 --- a/src/Kernel/Champs/Champ_Generique_Predefini.cpp +++ b/src/Kernel/Champs/Champ_Generique_Predefini.cpp @@ -19,7 +19,7 @@ Implemente_instanciable(Champ_Generique_Predefini,"Predefini",Champ_Gen_de_Champs_Gen); // XD predefini champ_generique_base predefini -1 This keyword is used to post process predefined postprocessing fields. -// XD attr pb_champ deuxmots pb_champ 0 { Pb_champ nom_pb nom_champ } : nom_pb is the problem name and nom_champ is the selected field name. The available keywords for the field name are: energie_cinetique_totale, energie_cinetique_elem, viscosite_turbulente, viscous_force_x, viscous_force_y, viscous_force_z, pressure_force_x, pressure_force_y, pressure_force_z, total_force_x, total_force_y, total_force_z, viscous_force, pressure_force, total_force +// XD attr pb_champ deuxmots pb_champ 0 { Pb_champ nom_pb nom_champ } : nom_pb is the problem name and nom_champ is the selected field name. 
The available keywords for the field name are: energie_cinetique_totale, energie_cinetique_elem, enstrophie_totale, viscosite_turbulente, viscous_force_x, viscous_force_y, viscous_force_z, pressure_force_x, pressure_force_y, pressure_force_z, total_force_x, total_force_y, total_force_z, viscous_force, pressure_force, total_force Sortie& Champ_Generique_Predefini::printOn(Sortie& s ) const { @@ -92,6 +92,8 @@ const Noms Champ_Generique_Predefini::get_property(const Motcle& query) const mots[0] = "kg.m2/s2"; else if (Motcle(type_champ_)=="ENERGIE_CINETIQUE_ELEM") mots[0] = "kg/(m.s2)"; + else if (Motcle(type_champ_)=="ENSTROPHIE_TOTALE") + mots[0] = "s-2"; else if (Motcle(type_champ_)=="VISCOSITE_TURBULENTE") mots[0] = "m2/s"; else if (Motcle(type_champ_)=="VISCOUS_FORCE_X") @@ -141,7 +143,7 @@ void Champ_Generique_Predefini::nommer_source() Nom Champ_Generique_Predefini::construit_expression() { - Motcles les_mots(15); + Motcles les_mots(16); { les_mots[0] = "energie_cinetique_totale"; les_mots[1] = "energie_cinetique_elem"; @@ -158,6 +160,7 @@ Nom Champ_Generique_Predefini::construit_expression() les_mots[12] = "viscous_force"; les_mots[13] = "pressure_force"; les_mots[14] = "total_force"; + les_mots[15] = "enstrophie_totale"; } Nom expression(""); @@ -363,6 +366,17 @@ Nom Champ_Generique_Predefini::construit_expression() } + case 15: + { + // enstrophie_totale = volume-integrated 0.5*|omega|^2, with omega = curl(u) + expression = " Reduction_0D { methode somme_ponderee "; + expression += " source Transformation { methode formule expression 1 0.5*norme_omega*norme_omega "; + expression += " sources { Transformation { methode norme localisation elem source RefChamp { Pb_champ "; + expression += nom_pb_; + expression += " vorticite } nom_source norme_omega } } } } "; + break; + } + default : { Cerr<<"Only keywords among "<(size_vect)); + end_gpu_timer(__KERNEL_NAME__); } DoubleTrav vect_source; @@ -287,7 +290,8 @@ const Champ_base& 
Champ_Generique_Reduction_0D::get_champ(OWN_PTR(Champ_base)&) //pour appliquer val_extraite = mp_prodscal(vect_source,un) //Sa dimension est alors fixee par rapport au nombre d items de la source //ex : zvf.nb_faces() si loc==FACE - if (methode_=="somme" || methode_=="moyenne" || methode_=="sum" || methode_=="average") + const bool flag = methode_=="somme" || methode_=="moyenne" || methode_=="sum" || methode_=="average"; + if (flag) { Entity loc = get_localisation(); if (loc==Entity::ELEMENT) @@ -316,19 +320,30 @@ const Champ_base& Champ_Generique_Reduction_0D::get_champ(OWN_PTR(Champ_base)&) } else { - ToDo_Kokkos("critical, warning check you have a NR test case with .son !"); - int k=0; - for (int i=0; i(valeurs_source).view_ro(); + DoubleArrView vect = static_cast(vect_source).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), n_faces, + KOKKOS_LAMBDA(const int i) { - if (methode_=="somme" || methode_=="moyenne" || methode_=="sum" || methode_=="average") - vect_source(i) = valeurs_source(i); - else + if (orientation(i) == comp) vect(i) = valeurs(i); + }); + end_gpu_timer(__KERNEL_NAME__); + } + else + { + ToDo_Kokkos("critical not so easy, numerotation dependant"); + int k = 0; + for (int i = 0; i < n_faces; i++) + if (zvf.orientation(i) == comp) { vect_source(k) = valeurs_source(i); k++; } - } + } } // Passage si necessaire de la composante pour les Champ_face extraire(val_extraite,vect_source,basis_function,(nb_dim==nb_comp?-1:comp)); @@ -344,10 +359,15 @@ const Champ_base& Champ_Generique_Reduction_0D::get_champ(OWN_PTR(Champ_base)&) } else { - ToDo_Kokkos("critical, warning check you have a NR test case with .son !"); - for (int i=0; i(espace_valeurs).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), size, KOKKOS_LAMBDA(const int i) + { + if (orientation(i)==comp) + valeurs(i) = val_extraite; + }); + end_gpu_timer(__KERNEL_NAME__); } } } diff --git a/src/Kernel/Champs/Champ_Generique_Transformation.cpp 
b/src/Kernel/Champs/Champ_Generique_Transformation.cpp index a69934301b..ba180272f4 100644 --- a/src/Kernel/Champs/Champ_Generique_Transformation.cpp +++ b/src/Kernel/Champs/Champ_Generique_Transformation.cpp @@ -538,12 +538,7 @@ const Champ_base& Champ_Generique_Transformation::get_champ(OWN_PTR(Champ_base)& //de stockage DoubleTrav positions; if (localisation_ == "elem") - { - if (zvf.xp().nb_dim() != 2) /* xp() non initialise */ - zvf.domaine().calculer_centres_gravite(positions); - else - zvf.get_position(positions); - } + zvf.get_position(positions); else if (localisation_ == "som") positions = get_ref_domain().coord_sommets(); else if (localisation_ == "faces") @@ -579,7 +574,7 @@ const Champ_base& Champ_Generique_Transformation::get_champ(OWN_PTR(Champ_base)& Process::exit(); } DoubleTravs sources_val(nb_sources); - IntTrav nb_comps(nb_sources); + ArrOfInt nb_comps(nb_sources); Noms nom_source(nb_sources); int dim_compo = 2*dimension; Noms compo(dim_compo); @@ -888,6 +883,7 @@ const Champ_base& Champ_Generique_Transformation::get_champ(OWN_PTR(Champ_base)& { ToDo_Kokkos("Code but check test!"); int dim = dimension; + int nb_comp = nb_comp_; int nb_elem = valeurs_espace.dimension(0); Kokkos::Array sources; for (int so=0; so #include #include +#include +#include Implemente_instanciable(Champ_front_txyz,"Champ_front_fonc_txyz",Ch_front_var_instationnaire_indep); // XD champ_front_fonc_txyz front_field_base champ_front_fonc_txyz 0 Boundary field which is not constant in space and in time. 
@@ -98,25 +100,33 @@ Champ_front_base& Champ_front_txyz::affecter_(const Champ_front_base& ch) void Champ_front_txyz::mettre_a_jour(double temps) { - int dim=nb_comp(); - const Frontiere_dis_base& fr_dis=frontiere_dis(); + int dim = nb_comp(); + const Frontiere_dis_base& fr_dis = frontiere_dis(); const Domaine_VF& zvf = ref_cast(Domaine_VF, fr_dis.domaine_dis()); - int nb_faces=ref_cast(Front_VF, fr_dis).nb_faces(); + int nb_faces = ref_cast(Front_VF, fr_dis).nb_faces(); int premiere_face = ref_cast(Front_VF, fr_dis).num_premiere_face(); - DoubleTab& tab=valeurs_au_temps(temps); - for(int i=0; i= 3); + CDoubleTabView xv = zvf.xv().view_ro(); + DoubleTabView tab = tab_val.view_rw(); + for (int k = 0; k < dim; k++) { - for(int k=0; k= 3) - fxyz[k].setVar(3,zvf.xv(premiere_face + i, 2)); - tab(i,k)=fxyz[k].eval(); - } + ParserView fxyzk(fxyz[k]); + fxyzk.parseString(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces, KOKKOS_LAMBDA(const int i) + { + int threadId = fxyzk.acquire(); + fxyzk.setVar(0, temps, threadId); + fxyzk.setVar(1, xv(premiere_face + i, 0), threadId); + fxyzk.setVar(2, xv(premiere_face + i, 1), threadId); + fxyzk.setVar(3, dim3 ? 
xv(premiere_face + i, 2) : 0.0, threadId); + tab(i, k) = fxyzk.eval(threadId); + fxyzk.release(threadId); + }); + end_gpu_timer(__KERNEL_NAME__); } - tab.echange_espace_virtuel(); + tab_val.echange_espace_virtuel(); } double Champ_front_txyz::valeur_au_temps_et_au_point(double temps,int som,double x,double y, double z, int k) const diff --git a/src/Kernel/Champs/Champ_front_vide.h b/src/Kernel/Champs/Champ_front_vide.h index d9f2007ba1..bbe71e4a5f 100644 --- a/src/Kernel/Champs/Champ_front_vide.h +++ b/src/Kernel/Champs/Champ_front_vide.h @@ -34,8 +34,8 @@ class Champ_front_vide : public Champ_front_base public: bool has_valeurs_au_temps(double temps) const override { return false; } - DoubleTab& valeurs_au_temps(double temps) override { Process::exit("Impossible d'appeler les valeurs d'un champ_fronc_vide"); return les_valeurs->valeurs();}; - const DoubleTab& valeurs_au_temps(double temps) const override { Process::exit("Impossible d'appeler les valeurs d'un champ_fronc_vide"); return les_valeurs->valeurs();}; + DoubleTab& valeurs_au_temps(double temps) override { Process::exit("Impossible d'appeler les valeurs d'un champ_front_vide"); return les_valeurs->valeurs();}; + const DoubleTab& valeurs_au_temps(double temps) const override { Process::exit("Impossible d'appeler les valeurs d'un champ_front_vide"); return les_valeurs->valeurs();}; int avancer(double temps) override {return 1;}; int reculer(double temps) override {return 1;}; Champ_front_base& affecter_(const Champ_front_base& ch) override {return *this;}; diff --git a/src/Kernel/Champs/Champs_Don/Champ_Don_lu.cpp b/src/Kernel/Champs/Champs_Don/Champ_Don_lu.cpp index c30792f99a..5df5851921 100644 --- a/src/Kernel/Champs/Champs_Don/Champ_Don_lu.cpp +++ b/src/Kernel/Champs/Champs_Don/Champ_Don_lu.cpp @@ -104,7 +104,6 @@ Entree& Champ_Don_lu::readOn(Entree& is) // Lecture des valeurs dans le fichier fic DoubleTab& mes_val = valeurs(); IntTab compteur(nb_elems); - compteur = 0; DoubleVect point(dimension); 
DoubleVect val_lu(dim); int elem2; diff --git a/src/Kernel/Champs/Champs_Don/Champ_Uniforme_Morceaux.cpp b/src/Kernel/Champs/Champs_Don/Champ_Uniforme_Morceaux.cpp index ec4a3d7613..f08ddc13cf 100644 --- a/src/Kernel/Champs/Champs_Don/Champ_Uniforme_Morceaux.cpp +++ b/src/Kernel/Champs/Champs_Don/Champ_Uniforme_Morceaux.cpp @@ -97,9 +97,9 @@ Entree& Champ_Uniforme_Morceaux::readOn(Entree& is) z = z/nsom; for( k=0; k< dim; k++) { - fxyz[k].setVar("x",x); - fxyz[k].setVar("y",y); - fxyz[k].setVar("z",z); + fxyz[k].setVar(0,x); + fxyz[k].setVar(1,y); + fxyz[k].setVar(2,z); valeurs_(poly,k)=fxyz[k].eval(); } } @@ -134,9 +134,9 @@ Entree& Champ_Uniforme_Morceaux::readOn(Entree& is) z = z/nsom; for( k=0; k< dim; k++) { - fxyz[k].setVar("x",x); - fxyz[k].setVar("y",y); - fxyz[k].setVar("z",z); + fxyz[k].setVar(0,x); + fxyz[k].setVar(1,y); + fxyz[k].setVar(2,z); valeurs_(ssz(poly),k)=fxyz[k].eval(); } } diff --git a/src/Kernel/Champs_dis/Champ_Fonc_P0_base.h b/src/Kernel/Champs_dis/Champ_Fonc_P0_base.h index 62a6794cdd..4d760c130f 100644 --- a/src/Kernel/Champs_dis/Champ_Fonc_P0_base.h +++ b/src/Kernel/Champs_dis/Champ_Fonc_P0_base.h @@ -18,6 +18,11 @@ #include #include +#include +#include +#include +#include +class Table; class Champ_Fonc_P0_base: public Champ_Fonc_base, public Champ_implementation_P0 { @@ -78,6 +83,12 @@ class Champ_Fonc_P0_base: public Champ_Fonc_base, public Champ_implementation_P0 return Champ_implementation_P0::remplir_coord_noeuds_et_polys(positions, polys); } + inline void mettre_a_jour(double t, const Table& table, VECT(OBS_PTR(Champ_base))& les_ch_param) + { + Champ_implementation_P0::mettre_a_jour(t, table, les_ch_param); + Champ_Fonc_base::mettre_a_jour(t); + } + protected: Champ_base& le_champ() override { return *this; } const Champ_base& le_champ() const override { return *this; } diff --git a/src/Kernel/Champs_dis/Champ_implementation_P0.cpp b/src/Kernel/Champs_dis/Champ_implementation_P0.cpp index 21ce4d971a..ab3ef51292 100644 --- 
a/src/Kernel/Champs_dis/Champ_implementation_P0.cpp +++ b/src/Kernel/Champs_dis/Champ_implementation_P0.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -21,6 +21,9 @@ #include #include #include +#include +#include +#include DoubleVect& Champ_implementation_P0::valeur_a_elem(const DoubleVect& position, DoubleVect& result, int poly) const { @@ -152,7 +155,8 @@ DoubleTab& Champ_implementation_P0::valeur_aux_elems(const DoubleTab& positions, assert(tab_values.line_size() == nb_components); assert(tab_values.line_size() == nb_components || nb_components == 1); - bool kernelOnDevice = tab_result.checkDataOnDevice(tab_values); + // PL: 2026/05/11, as VDF and VEF is well now ported on GPU, we force the algorithm on the device: + bool kernelOnDevice = true; // tab_result.checkDataOnDevice(tab_values); if (kernelOnDevice) valeur_aux_elems_kernel(tab_values, tab_polys, tab_result, nb_components); else @@ -187,7 +191,8 @@ DoubleVect& Champ_implementation_P0::valeur_aux_elems_compo(const DoubleTab& pos assert(tab_result.size() == tab_polys.size()); assert(tab_values.line_size() == le_champ().nb_comp()); - bool kernelOnDevice = tab_result.checkDataOnDevice(tab_values); + // PL: 2026/05/11, as VDF and VEF is well now ported on GPU, we force the algorithm on the device: + bool kernelOnDevice = true; // tab_result.checkDataOnDevice(tab_values); if (kernelOnDevice) valeur_aux_elems_compo_kernel(tab_values, tab_polys, tab_result, ncomp); else @@ -333,3 +338,78 @@ int Champ_implementation_P0::affecter_(const Champ_base& ch) return 0; } } + +void Champ_implementation_P0::mettre_a_jour(double t, const Table& table, VECT(OBS_PTR(Champ_base))& les_ch_param) +{ + const Domaine_VF& domaine_VF = get_domaine_dis(); + 
DoubleTab& mes_valeurs = le_champ().valeurs(); + const int nb_elem = domaine_VF.nb_elem(), nb_elem_tot = domaine_VF.nb_elem_tot(), nb_param = les_ch_param.size(); + const int nbcomp = mes_valeurs.dimension(1); + const DoubleTab& centres_de_gravites = domaine_VF.xp(); + + // ToDo Kokkos: factorize somewhere this array or rewrite valeur_aux_elems ! + IntTrav les_polys(nb_elem_tot); + IntArrView les_polys_v = static_cast(les_polys).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem_tot, KOKKOS_LAMBDA(const int num_elem) + { + les_polys_v(num_elem) = num_elem; + }); + end_gpu_timer(__KERNEL_NAME__); + + if (nb_param==1 && nbcomp==1 && table.isfonction()==1) + { + // Ported on GPU. ToDo Kokkos, extend to more than one param or more than one nbcomp + DoubleTrav val_param_aux_elems(nb_elem_tot, nbcomp); + les_ch_param[0]->valeur_aux_elems(centres_de_gravites, les_polys, val_param_aux_elems); + // Cree un parser specifique ParserView pour Kokkos: + ParserView parser(table.parser(0)); + parser.parseString(); + CDoubleTabView val_params_aux_elems_v = val_param_aux_elems.view_ro(); + DoubleTabView mes_valeurs_v = mes_valeurs.view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA( + const int num_elem) + { + int threadId = parser.acquire(); + for (int ncomp = 0; ncomp < nbcomp; ncomp++) + { + double val = val_params_aux_elems_v(num_elem, ncomp); + + parser.setVar(0, val, threadId); + mes_valeurs_v(num_elem, ncomp) = parser.eval(threadId); + } + parser.release(threadId); + }); + end_gpu_timer(__KERNEL_NAME__); + } + else + { + ToDo_Kokkos("critical"); + DoubleTabs val_params_aux_elems; + for (int i = 0; i < nb_param; i++) + { + DoubleTab vp(nb_elem_tot, les_ch_param[i]->valeurs().dimension(1)); + val_params_aux_elems.add(vp); + } + for (int i = 0; i < nb_param; i++) + les_ch_param[i]->valeur_aux_elems(centres_de_gravites, les_polys, val_params_aux_elems[i]); + + if (table.isfonction() != 2) + { + std::vector vals; + 
vals.reserve(nb_param); // Pre-allocate space once + for (int num_elem = 0; num_elem < nb_elem; num_elem++) + for (int ncomp = 0; ncomp < nbcomp; ncomp++) + { + vals.clear(); + for (int n = 0; n < nb_param; n++) + vals.push_back(val_params_aux_elems[n](num_elem, les_ch_param[n]->valeurs().dimension(1) == 1 ? 0 : ncomp)); + mes_valeurs(num_elem, ncomp) = table.val(vals, ncomp); + } + } + else + { + table.valeurs(val_params_aux_elems[0], centres_de_gravites, t, mes_valeurs); + } + } +} + diff --git a/src/Kernel/Champs_dis/Champ_implementation_P0.h b/src/Kernel/Champs_dis/Champ_implementation_P0.h index e1d4a74d08..1b77f35fff 100644 --- a/src/Kernel/Champs_dis/Champ_implementation_P0.h +++ b/src/Kernel/Champs_dis/Champ_implementation_P0.h @@ -19,6 +19,7 @@ #include #include #include +class Table; class Champ_implementation_P0: public Champ_implementation { @@ -32,8 +33,9 @@ class Champ_implementation_P0: public Champ_implementation DoubleTab& remplir_coord_noeuds(DoubleTab& positions) const override; int remplir_coord_noeuds_et_polys(DoubleTab& positions, IntVect& polys) const override; int imprime_P0(Sortie&, int) const; + void mettre_a_jour(double, const Table&, VECT(OBS_PTR(Champ_base))&); - public_for_cuda + protected_but_public_for_cuda DoubleTab& valeur_aux_sommets_impl(DoubleTab& result) const override; protected: diff --git a/src/Kernel/Cond_Lim/Dirichlet_loi_paroi.h b/src/Kernel/Cond_Lim/Dirichlet_loi_paroi.h index e41e8951ad..21598b6cf8 100644 --- a/src/Kernel/Cond_Lim/Dirichlet_loi_paroi.h +++ b/src/Kernel/Cond_Lim/Dirichlet_loi_paroi.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -56,9 +56,10 @@ class Dirichlet_loi_paroi: public Dirichlet virtual double val_imp(int i) const override {return d_(i,0);} virtual double val_imp(int i, int j) const override {return d_(i,j);} + virtual const DoubleTab& tab_val_imp(double temps=DMAXFLOAT) const override { return d_; } virtual double val_imp_au_temps(double temps, int i) const override { - Process::exit(que_suis_je() + " : You shouldn't go through val_imp_au_temps but through val_imp ! "); + Process::exit(que_suis_je() + " : You shouldn't go through val_imp_au_temps but through val_imp ! "); return 1.; } virtual double val_imp_au_temps(double temps, int i, int j) const override diff --git a/src/Kernel/Cond_Lim/Echange_global_impose.cpp b/src/Kernel/Cond_Lim/Echange_global_impose.cpp index 5fcab943c0..563609d408 100644 --- a/src/Kernel/Cond_Lim/Echange_global_impose.cpp +++ b/src/Kernel/Cond_Lim/Echange_global_impose.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -249,6 +249,22 @@ double Echange_global_impose::flux_exterieur_impose(int i,int j) const return champ_exterieur(i,j,phi_ext()); } +const DoubleTab& Echange_global_impose::tab_phi_ext() const +{ + const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis()); + int size = le_champ_front->valeurs().dimension(0) == 1 ? 
le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0); + int nb_comp = le_champ_front->valeurs().dimension(1); + if (phi_ext_tab_.dimension(0) != size) + phi_ext_tab_.resize(size, nb_comp); + if (phi_ext_lu_) + for (int face = 0; face < size; face++) + for (int comp = 0; comp < nb_comp; comp++) + phi_ext_tab_(face, comp) = flux_exterieur_impose(face, comp); + else + phi_ext_tab_ = 0.; + return phi_ext_tab_; +} + double Echange_global_impose::flux_exterieur_impose(int i) const { return champ_exterieur(i,phi_ext()); diff --git a/src/Kernel/Cond_Lim/Echange_global_impose.h b/src/Kernel/Cond_Lim/Echange_global_impose.h index ed861315c0..7da5c90117 100644 --- a/src/Kernel/Cond_Lim/Echange_global_impose.h +++ b/src/Kernel/Cond_Lim/Echange_global_impose.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -63,10 +63,14 @@ class Echange_global_impose: public Echange_impose_base virtual double derivee_flux_exterieur_imposee(int i, int j) const; const bool& has_phi_ext() const { return phi_ext_lu_; } + const DoubleTab& tab_phi_ext() const; protected: bool phi_ext_lu_ = false; OWN_PTR(Champ_front_base) derivee_phi_ext_, phi_ext_; + +private: + mutable DoubleTab phi_ext_tab_; }; #endif diff --git a/src/Kernel/Cond_Lim/Echange_impose_base.cpp b/src/Kernel/Cond_Lim/Echange_impose_base.cpp index 3a9aaa382f..eb0f06a6a2 100644 --- a/src/Kernel/Cond_Lim/Echange_impose_base.cpp +++ b/src/Kernel/Cond_Lim/Echange_impose_base.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -255,80 +255,94 @@ void Echange_impose_base::associer_fr_dis_base(const Frontiere_dis_base& fr) Cond_lim_base::associer_fr_dis_base(fr); } -const DoubleTab& Echange_impose_base::tab_T_ext(double temps) const +const DoubleTab& Echange_impose_base::tab_T_ext(double temps, bool with_virtual_faces) const { if (temps==DMAXFLOAT) temps = le_champ_front->get_temps_defaut(); const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis()); // ToDo factorize in Champ_front_base::valeurs_face() - int size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0); + int size; + if (with_virtual_faces) + size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0); + else + size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces() : le_champ_front->valeurs().dimension(0); + if (size>0) { bool update = le_champ_front->instationnaire(); - if (text_.dimension(0) != size) + if (tab_text_.dimension(0) != size) { - text_.resize(size, le_champ_front->valeurs().dimension(1)); + tab_text_.resize(size, le_champ_front->valeurs().dimension(1)); update = true; } update = true; // Provisoire if (update) { - int nb_comp = text_.dimension(1); + int nb_comp = tab_text_.dimension(1); for (int face = 0; face < size; face++) for (int comp = 0; comp < nb_comp; comp++) - text_(face, comp) = T_ext(face, comp); + tab_text_(face, comp) = T_ext(face, comp); } } - return text_; + return tab_text_; } -const DoubleTab& Echange_impose_base::tab_h_imp(double temps) const +const DoubleTab& Echange_impose_base::tab_h_imp(double temps, bool with_virtual_faces) const { if (temps==DMAXFLOAT) temps = le_champ_front->get_temps_defaut(); const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis()); // ToDo factorize in 
Champ_front_base::valeurs_face() - int size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0); + int size; + if (with_virtual_faces) + size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0); + else + size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces() : le_champ_front->valeurs().dimension(0); + if (size>0) { bool update = le_champ_front->instationnaire(); - if (himp_.dimension(0) != size) + if (tab_himp_.dimension(0) != size) { - himp_.resize(size, le_champ_front->valeurs().dimension(1)); + tab_himp_.resize(size, le_champ_front->valeurs().dimension(1)); update = true; } update = true; // Provisoire if (update) { - int nb_comp = himp_.dimension(1); + int nb_comp = tab_himp_.dimension(1); for (int face = 0; face < size; face++) for (int comp = 0; comp < nb_comp; comp++) - himp_(face, comp) = h_imp(face, comp); + tab_himp_(face, comp) = h_imp(face, comp); } } - return himp_; + return tab_himp_; } -const DoubleTab& Echange_impose_base::tab_emissivite(double temps) const +const DoubleTab& Echange_impose_base::tab_emissivite(double temps, bool with_virtual_faces) const { if (temps==DMAXFLOAT) temps = le_champ_front->get_temps_defaut(); const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis()); // ToDo factorize in Champ_front_base::valeurs_face() - int size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0); + int size; + if (with_virtual_faces) + size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0); + else + size = le_champ_front->valeurs().dimension(0) == 1 ? 
le_bord.nb_faces() : le_champ_front->valeurs().dimension(0); if (size>0) { bool update = le_champ_front->instationnaire(); - if (eps_.dimension(0) != size) + if (tab_eps_.dimension(0) != size) { - eps_.resize(size, le_champ_front->valeurs().dimension(1)); + tab_eps_.resize(size, le_champ_front->valeurs().dimension(1)); update = true; } update = true; // Provisoire if (update) { - int nb_comp = eps_.dimension(1); + int nb_comp = tab_eps_.dimension(1); for (int face = 0; face < size; face++) for (int comp = 0; comp < nb_comp; comp++) - eps_(face, comp) = emissivite(face, comp); + tab_eps_(face, comp) = emissivite(face, comp); } } - return eps_; + return tab_eps_; } diff --git a/src/Kernel/Cond_Lim/Echange_impose_base.h b/src/Kernel/Cond_Lim/Echange_impose_base.h index fb08bfb929..03d46c0acb 100644 --- a/src/Kernel/Cond_Lim/Echange_impose_base.h +++ b/src/Kernel/Cond_Lim/Echange_impose_base.h @@ -45,9 +45,9 @@ class Echange_impose_base : public Cond_lim_base inline bool has_emissivite() const { return bool(emissivite_); } inline bool has_h_imp() const { return bool(h_imp_); } - const DoubleTab& tab_T_ext(double temps=DMAXFLOAT) const; - const DoubleTab& tab_h_imp(double temps=DMAXFLOAT) const; - const DoubleTab& tab_emissivite(double temps=DMAXFLOAT) const; + const DoubleTab& tab_T_ext(double temps=DMAXFLOAT, bool with_virtual_faces=false) const; + const DoubleTab& tab_h_imp(double temps=DMAXFLOAT, bool with_virtual_faces=false) const; + const DoubleTab& tab_emissivite(double temps=DMAXFLOAT, bool with_virtual_faces=false) const; virtual double T_ext(int num) const; virtual double T_ext(int num,int k) const; virtual double h_imp(int num) const; @@ -92,9 +92,9 @@ protected : OWN_PTR(Champ_front_base) h_imp_, emissivite_ /* si Echange_externe_radiatif */; private: // Stocke toutes les valeurs sur les faces (utile pour GPU): - mutable DoubleTab text_; - mutable DoubleTab himp_; - mutable DoubleTab eps_; + mutable DoubleTab tab_text_; + mutable DoubleTab tab_himp_; + 
mutable DoubleTab tab_eps_; }; #endif /* Echange_impose_base_included */ diff --git a/src/Kernel/Cond_Lim/Navier.h b/src/Kernel/Cond_Lim/Navier.h index 0210fe1fad..89ad5d5aba 100644 --- a/src/Kernel/Cond_Lim/Navier.h +++ b/src/Kernel/Cond_Lim/Navier.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -32,7 +32,7 @@ class Navier: public Cond_lim_base Declare_base(Navier); public: int compatible_avec_eqn(const Equation_base&) const override { return 1; } - + virtual const DoubleTab* coefficient_frottement() const { return nullptr; }; virtual double coefficient_frottement(int ) const { return 0.; } virtual double coefficient_frottement(int , int ) const { return 0.; } virtual double coefficient_frottement_grad(int ) const { return 0.; } // Change the coefficient when calculation of gradient : nu = nullptr diff --git a/src/Kernel/Cond_Lim/Neumann.cpp b/src/Kernel/Cond_Lim/Neumann.cpp index 919cf559d0..b5bfc5fd48 100644 --- a/src/Kernel/Cond_Lim/Neumann.cpp +++ b/src/Kernel/Cond_Lim/Neumann.cpp @@ -64,7 +64,7 @@ double Neumann::flux_impose(int i, int j) const * * @return const DoubleTab& Reference to the updated imposed flux array. 
*/ -const DoubleTab& Neumann::flux_impose(bool with_virtual_faces) const +const DoubleTab& Neumann::tab_flux_impose(bool with_virtual_faces) const { const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis()); // ToDo factorize in Champ_front_base::valeurs_face() diff --git a/src/Kernel/Cond_Lim/Neumann.h b/src/Kernel/Cond_Lim/Neumann.h index 8cebcbe9bd..14e4abdb68 100644 --- a/src/Kernel/Cond_Lim/Neumann.h +++ b/src/Kernel/Cond_Lim/Neumann.h @@ -33,7 +33,7 @@ class Neumann: public Cond_lim_base public: virtual double flux_impose(int i) const; virtual double flux_impose(int i, int j) const; - const DoubleTab& flux_impose(bool nb_faces_tot=false) const; + const DoubleTab& tab_flux_impose(bool nb_faces_tot=false) const; protected: mutable DoubleTab flux_impose_; // Stocke toutes les valeurs du flux sur toutes les faces de la frontiere (pas d'hypothese sur un champ uniforme). Utile pour le GPU. diff --git a/src/Kernel/Cond_Lim/Neumann_val_ext.h b/src/Kernel/Cond_Lim/Neumann_val_ext.h index b35eca5966..e56deea570 100644 --- a/src/Kernel/Cond_Lim/Neumann_val_ext.h +++ b/src/Kernel/Cond_Lim/Neumann_val_ext.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2022, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -37,6 +37,7 @@ class Neumann_val_ext: public Neumann virtual double val_ext(int i) const = 0; virtual double val_ext(int i, int j) const = 0; + virtual const DoubleTab& tab_val_ext() const = 0; }; #endif diff --git a/src/Kernel/Framework/Champ_Inc_base.cpp b/src/Kernel/Framework/Champ_Inc_base.cpp index 0e0dbe9fd3..35296a649e 100644 --- a/src/Kernel/Framework/Champ_Inc_base.cpp +++ b/src/Kernel/Framework/Champ_Inc_base.cpp @@ -742,45 +742,93 @@ DoubleTab Champ_Inc_base::valeur_aux_bords() const } //sinon, calcul a partir des CLs const Domaine_VF& domaine = ref_cast(Domaine_VF, domaine_dis_base()); - const IntTab& f_e = domaine.face_voisins(), &f_s = domaine.face_sommets(); - DoubleTrav result(domaine.xv_bord().dimension_tot(0), valeurs().line_size()); + const IntTab& f_s = domaine.face_sommets(); + DoubleTrav tab_result(domaine.xv_bord().dimension_tot(0), valeurs().line_size()); const Conds_lim& cls = domaine_Cl_dis().les_conditions_limites(); - int j, k, f, fb, s, n, N = result.line_size(), is_p = (le_nom().debute_par("pression") || le_nom().debute_par("pressure")), n_som; + int k, N = tab_result.line_size(), is_p = (le_nom().debute_par("pression") || le_nom().debute_par("pressure")), n_som; for (const auto& itr : cls) { const Front_VF& fr = ref_cast(Front_VF, itr->frontiere_dis()); //valeur au bord imposee, sauf si c'est une paroi (dans ce cas, la CL peut avoir moins de composantes que le champ -> Energie_Multiphase) if (is_p ? sub_type(Neumann, itr.valeur()) : (sub_type(Dirichlet, itr.valeur()) && !sub_type(Scalaire_impose_paroi, itr.valeur()))) - for (j = 0; j < fr.nb_faces_tot(); j++) - for (f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++) - result(fb, n) = is_p ? 
ref_cast(Neumann, itr.valeur()).flux_impose(j, n) : ref_cast(Dirichlet, itr.valeur()).val_imp(j, n); + { + int nb_faces = domaine.nb_faces(); + int premiere_face_int = domaine.premiere_face_int(); + CIntArrView ind_faces_virt_bord = domaine.ind_faces_virt_bord().view_ro(); + CIntArrView num_face = fr.num_face().view_ro(); + // tab_flux_impose is pressure here + CDoubleTabView val_imp = is_p ? ref_cast(Neumann, itr.valeur()).tab_flux_impose(true).view_ro() : ref_cast(Dirichlet, itr.valeur()).tab_val_imp().view_ro(); + DoubleTabView result = tab_result.view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, fr.nb_faces_tot()), KOKKOS_LAMBDA(const int j) + { + int f = num_face(j); + int fb = f < premiere_face_int ? f : ind_faces_virt_bord(f - nb_faces); // domaine.fbord(f); + for (int n = 0; n < N; n++) + result(fb, n) = val_imp(j, n); + }); + end_gpu_timer(__KERNEL_NAME__); + } else if (sub_type(Neumann_val_ext, itr.valeur())) //valeur externe imposee - for (j = 0; j < fr.nb_faces_tot(); j++) - for (f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++) - result(fb, n) = ref_cast(Neumann_val_ext, itr.valeur()).val_ext(j, n); + { + int nb_faces = domaine.nb_faces(); + int premiere_face_int = domaine.premiere_face_int(); + CIntArrView ind_faces_virt_bord = domaine.ind_faces_virt_bord().view_ro(); + CIntArrView num_face = fr.num_face().view_ro(); + CDoubleTabView val_ext = ref_cast(Neumann_val_ext, itr.valeur()).tab_val_ext().view_ro(); + DoubleTabView result = tab_result.view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, fr.nb_faces_tot()), KOKKOS_LAMBDA(const int j) + { + int f = num_face(j); + int fb = f < premiere_face_int ? 
f : ind_faces_virt_bord(f - nb_faces); // domaine.fbord(f); + for (int n = 0; n < N; n++) + result(fb, n) = val_ext(j, n); + }); + end_gpu_timer(__KERNEL_NAME__); + } else if (sub_type(Champ_Inc_P0_base, *this)) - for (j = 0; j < fr.nb_faces_tot(); j++) //Champ P0 : on peut prendre la valeur en l'element - for (f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++) - result(fb, n) = valeurs()(f_e(f, f_e(f, 0) == -1), n); - else if (sub_type(Champ_Inc_P1_base, *this)) - for (j = 0; j < fr.nb_faces_tot(); j++) //Champ P1 : moyenne des valeurs aux sommets + { + int nb_faces = domaine.nb_faces(); + int premiere_face_int = domaine.premiere_face_int(); + CIntArrView ind_faces_virt_bord = domaine.ind_faces_virt_bord().view_ro(); + CIntArrView num_face = fr.num_face().view_ro(); + CIntTabView face_voisins = domaine.face_voisins().view_ro(); + CDoubleTabView inco = valeurs().view_ro(); + DoubleTabView result = tab_result.view_wo(); + //Champ P0 : on peut prendre la valeur en l'element + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, fr.nb_faces_tot()), KOKKOS_LAMBDA(const int j) { - f = fr.num_face(j), fb = domaine.fbord(f); - for (n_som = 0; n_som < f_s.dimension(1) && f_s(f, n_som) >= 0;) - n_som++; - for (n = 0; n < N; n++) - result(fb, n) = 0; - for (k = 0; k < n_som; k++) - for (s = f_s(f, k), n = 0; n < N; n++) - result(fb, n) += valeurs()(s, n) / n_som; - } + int f = num_face(j); + int fb = f < premiere_face_int ? 
f : ind_faces_virt_bord(f - nb_faces); // domaine.fbord(f); + for (int n = 0; n < N; n++) + result(fb, n) = inco(face_voisins(f, face_voisins(f, 0) == -1), n); + }); + end_gpu_timer(__KERNEL_NAME__); + } + else if (sub_type(Champ_Inc_P1_base, *this)) + { + ToDo_Kokkos("critical"); + for (int j = 0; j < fr.nb_faces_tot(); j++) //Champ P1 : moyenne des valeurs aux sommets + { + int f = fr.num_face(j), fb = domaine.fbord(f); + for (n_som = 0; n_som < f_s.dimension(1) && f_s(f, n_som) >= 0;) + n_som++; + for (int n = 0; n < N; n++) + tab_result(fb, n) = 0; + for (k = 0; k < n_som; k++) + for (int s = f_s(f, k), n = 0; n < N; n++) + tab_result(fb, n) += valeurs()(s, n) / n_som; + } + } else if (que_suis_je() == "Champ_P1NC") - for (j = 0; j < fr.nb_faces_tot(); j++) - for (f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++) - result(fb, n) = valeurs()(f, n); + { + ToDo_Kokkos("critical"); + for (int j = 0; j < fr.nb_faces_tot(); j++) + for (int f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++) + tab_result(fb, n) = valeurs()(f, n); + } else Process::exit("Champ_Inc_base::valeur_aux_bords() : must code something!"); } - return result; + return tab_result; } diff --git a/src/Kernel/Framework/Champ_base.cpp b/src/Kernel/Framework/Champ_base.cpp index 81678cc92d..f94f4365f6 100644 --- a/src/Kernel/Framework/Champ_base.cpp +++ b/src/Kernel/Framework/Champ_base.cpp @@ -176,10 +176,10 @@ DoubleTab& Champ_base::valeur_aux_centres_de_gravite(const Domaine& dom, DoubleT } IntTrav les_polys(nb_elem); - IntArrView les_polys_v = static_cast(les_polys).view_wo(); + IntArrView polys = static_cast(les_polys).view_wo(); Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int i) { - les_polys_v(i) = i; + polys(i) = i; }); end_gpu_timer(__KERNEL_NAME__); diff --git a/src/Kernel/Framework/Debog.cpp b/src/Kernel/Framework/Debog.cpp index fd7b308e6b..24d208a953 100644 --- a/src/Kernel/Framework/Debog.cpp +++ 
b/src/Kernel/Framework/Debog.cpp @@ -86,6 +86,12 @@ void Debog::verifier_getref(const char* const msg, int x, int& ref) Debog_Pb::get_debog_instance()->verifier(msg, x, &ref); } +void Debog::verifier(const std::string& msg, const DoubleVect& x) +{ + if (Debog_Pb::get_debog_instance()) + Debog_Pb::get_debog_instance()->verifier(msg.c_str(), x); +} + void Debog::verifier(const char* const msg, const DoubleVect& x) { if (Debog_Pb::get_debog_instance()) diff --git a/src/Kernel/Framework/Debog.h b/src/Kernel/Framework/Debog.h index 4ba1072046..7761ca6b32 100644 --- a/src/Kernel/Framework/Debog.h +++ b/src/Kernel/Framework/Debog.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -17,7 +17,7 @@ #define Debog_included #include - +#include class Champ_Inc_base; class Matrice_Base; class MD_Vector; @@ -29,6 +29,7 @@ class Debog static int active(); static void verifier(const char *const msg, double); static void verifier(const char *const msg, int); + static void verifier(const std::string& msg, const DoubleVect&); static void verifier(const char *const msg, const DoubleVect&); static void verifier(const char *const msg, const IntVect&); static void verifier_bord(const char *const msg, const DoubleVect& arr, int num_deb); diff --git a/src/Kernel/Framework/Debog_Pb.tpp b/src/Kernel/Framework/Debog_Pb.tpp index de02d8dd8f..03b0616e84 100644 --- a/src/Kernel/Framework/Debog_Pb.tpp +++ b/src/Kernel/Framework/Debog_Pb.tpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -196,7 +196,7 @@ void Debog_Pb::verifier_partie_std(const TRUSTVect<_TYPE_>& reference, const TRU if (delta > seuil_relatif_) { detailed_log_file_ << " DIFF " << message << " reference[" << i2 + j << "]=" << y << " \tcurrent[" << i1 + j - << "]=" << x << " \trelative error=" << delta << finl; + << "]=" << x << " \trelative difference=" << delta << finl; } } else // int @@ -243,13 +243,13 @@ void Debog_Pb::verifier_partie_std(const TRUSTVect<_TYPE_>& reference, const TRU if (Process::je_suis_maitre()) { if (IS_DOUBLE) - log_file_ << " " << resu << " : Max relative error " << max_err_items_reels << " (max ref value=" << adim << ") id=" << identificateur << finl; + log_file_ << " " << resu << " : Max relative difference " << max_err_items_reels << " (max ref value=" << adim << ") id=" << identificateur << finl; else log_file_ << " " << resu << " : integer field " << identificateur << finl; } if (IS_DOUBLE) - detailed_log_file_ << " " << resu << " : Max relative error " << max_err_items_reels << " (max ref value=" << adim << ") id=" << identificateur << finl; + detailed_log_file_ << " " << resu << " : Max relative difference " << max_err_items_reels << " (max ref value=" << adim << ") id=" << identificateur << finl; else detailed_log_file_ << " " << resu << " : integer field " << identificateur << finl; @@ -404,7 +404,7 @@ Debog_Pb::verifier(const char *const msg, _TYPE_ x, _TYPE_ *ref_value) if (delta >= seuil_absolu_ && delta / adim >= seuil_relatif_) { err = (_TYPE_)delta; - detailed_log_file_ << " DIFF (double) reference=" << y << " \tcurrent=" << x << " \trelative error=" << delta / adim << " \t(max ref value=" << adim << ")" << finl; + detailed_log_file_ << " DIFF (double) reference=" << y << " \tcurrent=" << x << " \trelative difference=" << delta / adim << " \t(max ref value=" << adim << ")" << finl; } } else // 
int @@ -420,7 +420,7 @@ Debog_Pb::verifier(const char *const msg, _TYPE_ x, _TYPE_ *ref_value) const char *ok = (err > 0.) ? " ERROR " : " OK "; if (IS_DOUBLE) { - log_file_ << ok << " : comparing double: reference=" << y << " absolute error=" << err << finl; + log_file_ << ok << " : comparing double: reference=" << y << " absolute difference=" << err << finl; if (err > 0.) error_function(); } else // int diff --git a/src/Kernel/Framework/Domaine_Cl_dis_base.cpp b/src/Kernel/Framework/Domaine_Cl_dis_base.cpp index c033d8fc17..57d1a02ac2 100644 --- a/src/Kernel/Framework/Domaine_Cl_dis_base.cpp +++ b/src/Kernel/Framework/Domaine_Cl_dis_base.cpp @@ -58,7 +58,6 @@ Entree& Domaine_Cl_dis_base::readOn(Entree& is) int n = ledomaine.nb_front_Cl(); IntTab front_deja_lu(n); - front_deja_lu = 0; les_conditions_limites().dimensionner(n); int nb_clim=0; diff --git a/src/Kernel/Framework/Ecrire_fichier_xyz_valeur.cpp b/src/Kernel/Framework/Ecrire_fichier_xyz_valeur.cpp index 64a3face7b..349b9775cb 100644 --- a/src/Kernel/Framework/Ecrire_fichier_xyz_valeur.cpp +++ b/src/Kernel/Framework/Ecrire_fichier_xyz_valeur.cpp @@ -198,7 +198,7 @@ void Ecrire_fichier_xyz_valeur::write_fields() const if(champ_stat) { DoubleTab copie(field->valeurs()); - field->valeurs() = op_stat->calculer_valeurs(); + op_stat->calculer(field->valeurs()); field->valeur_aux(pos, val); field->valeurs() = copie; } diff --git a/src/Kernel/Framework/Equation_base.h b/src/Kernel/Framework/Equation_base.h index 63f1fc5e62..7b5bc10b52 100644 --- a/src/Kernel/Framework/Equation_base.h +++ b/src/Kernel/Framework/Equation_base.h @@ -257,7 +257,7 @@ public : } inline const bool& diffusion_multi_scalaire() const { return diffusion_multi_scalaire_; } - public_for_cuda + protected_but_public_for_cuda void Gradient_conjugue_diff_impl(DoubleTrav& secmem, DoubleTab& solution, int size_terme_mul, const DoubleTab& term_mul); protected : diff --git a/src/Kernel/Framework/IBM/Source_PDF_base.cpp 
b/src/Kernel/Framework/IBM/Source_PDF_base.cpp index 4fecca3722..9f8c32508e 100644 --- a/src/Kernel/Framework/IBM/Source_PDF_base.cpp +++ b/src/Kernel/Framework/IBM/Source_PDF_base.cpp @@ -771,7 +771,6 @@ void Source_PDF_base::update_elem_IBM(DoubleTab& vecteur_deplacement, double alp assert (nb_elem == vecteur_deplacement.dimension(0)); assert (dim_esp == vecteur_deplacement.dimension(1)); IntTab indic_dead_cell(nb_elem); - indic_dead_cell = 0; // calcul voisins de chaque element traverse IntLists elem_voisins(nb_elem_tot); diff --git a/src/Kernel/Framework/Operateur.cpp b/src/Kernel/Framework/Operateur.cpp index fc3b9267b8..fbdd0a834d 100644 --- a/src/Kernel/Framework/Operateur.cpp +++ b/src/Kernel/Framework/Operateur.cpp @@ -20,6 +20,7 @@ #include #include #include +#include Sortie& Operateur::ecrire(Sortie& os) const { @@ -147,6 +148,7 @@ void Operateur::completer() le_champ_inco=mon_equation->inconnue(); l_op_base().completer(); + op_base_ = l_op_base().que_suis_je(); } void Operateur::associer_champ(const Champ_Inc_base& ch, const std::string& nom_ch) @@ -230,6 +232,7 @@ int Operateur::impr(Sortie& os) const */ DoubleTab& Operateur::ajouter(const Champ_Inc_base& ch, DoubleTab& resu) const { + Debog::verifier(op_base_+"::ajouter(ch,resu) avant ch=",ch.valeurs()); int i ; int nstep=l_op_base().get_nb_ss_pas_de_temps(); double dt=equation().schema_temps().pas_de_temps(); @@ -249,6 +252,7 @@ DoubleTab& Operateur::ajouter(const Champ_Inc_base& ch, DoubleTab& resu) const solveur_masse.appliquer(derivee); inco.ajoute_sans_ech_esp_virt(dt, derivee, VECT_ALL_ITEMS) ; } + Debog::verifier(op_base_+"::ajouter(ch,resu) apres resu=",resu); return resu; } @@ -282,7 +286,10 @@ DoubleTab& Operateur::calculer(const Champ_Inc_base& ch,DoubleTab& resu) const */ DoubleTab& Operateur::ajouter(DoubleTab& resu) const { - return ajouter(le_champ_inco->valeurs(), resu); + Debog::verifier(op_base_+"::ajouter() avant resu=",resu); + OBS_PTR(DoubleTab) ref = 
ajouter(le_champ_inco->valeurs(), resu); + Debog::verifier(op_base_+"::ajouter() apres resu=",resu); + return ref; } /*! @brief Applique l'operateur au champ inconnu et renvoie le resultat. diff --git a/src/Kernel/Framework/Operateur.h b/src/Kernel/Framework/Operateur.h index b792b816ba..6f54bb3070 100644 --- a/src/Kernel/Framework/Operateur.h +++ b/src/Kernel/Framework/Operateur.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -71,7 +71,7 @@ public : virtual int op_non_nul() const =0; protected : - std::string nom_inco_; + std::string nom_inco_, op_base_; OBS_PTR(Champ_Inc_base) le_champ_inco; Motcle typ; }; diff --git a/src/Kernel/Framework/Operateur_base.cpp b/src/Kernel/Framework/Operateur_base.cpp index 8a0577b48e..d0dcc2f515 100644 --- a/src/Kernel/Framework/Operateur_base.cpp +++ b/src/Kernel/Framework/Operateur_base.cpp @@ -233,7 +233,11 @@ DoubleTab& Operateur_base::ajouter(const DoubleTab& inco, DoubleTab& secmem) co if (equation().discretisation().is_poly_family()) ajouter_blocs({}, secmem); else - ajouter_blocs({}, secmem, {{ equation().inconnue().le_nom().getString(),inco }} ); //pour prise en compte du parametre inco (qui est pas forcement l'inco de l'equation) + { + tabs_t semi_impl; + semi_impl[equation().inconnue().le_nom().getString()].ref(inco); /* evite la copie de inco dans tabs_t */ + ajouter_blocs({}, secmem, semi_impl); + } } else Process::exit(que_suis_je() + " : ajouter() not coded!"); return secmem; diff --git a/src/Kernel/Framework/Solveur_Masse_Elem_proto.cpp b/src/Kernel/Framework/Solveur_Masse_Elem_proto.cpp index 37a80f6d53..5a93bda027 100644 --- a/src/Kernel/Framework/Solveur_Masse_Elem_proto.cpp +++ b/src/Kernel/Framework/Solveur_Masse_Elem_proto.cpp @@ -36,24 
+36,28 @@ void Solveur_Masse_Elem_proto::preparer_calcul_proto() solv_mass_->equation().init_champ_conserve(); } -DoubleTab& Solveur_Masse_Elem_proto::appliquer_impl_proto(DoubleTab& sm) const +DoubleTab& Solveur_Masse_Elem_proto::appliquer_impl_proto(DoubleTab& tab_sm) const { const Domaine_VF& domaine = le_dom_.valeur(); - const DoubleVect& ve = domaine.volumes(), &pe = solv_mass_->equation().milieu().porosite_elem(); - const DoubleTab& der = solv_mass_->equation().champ_conserve().derivees().at(solv_mass_->equation().inconnue().le_nom().getString()); - - int e, ne_tot = domaine.nb_elem_tot(), n, N = sm.line_size(); - assert(sm.dimension_tot(0) >= ne_tot && N == der.line_size()); + int ne_tot = domaine.nb_elem_tot(), N = tab_sm.line_size(); + const DoubleTab& tab_der = solv_mass_->equation().champ_conserve().derivees().at(solv_mass_->equation().inconnue().le_nom().getString()); + assert(tab_sm.dimension_tot(0) >= ne_tot && N == tab_der.line_size()); /* partie elem */ - for (e = 0; e < ne_tot; e++) - for (n = 0; n < N; n++) - if (std::abs(der(e, n)) > 1e-10) + CDoubleArrView pe = solv_mass_->equation().milieu().porosite_elem().view_ro(); + CDoubleArrView ve = domaine.volumes().view_ro(); + CDoubleTabView der = tab_der.view_ro(); + DoubleTabView sm = tab_sm.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, ne_tot), KOKKOS_LAMBDA( const int e) + { + for (int n = 0; n < N; n++) + if (Kokkos::fabs(der(e, n)) > 1e-10) sm(e, n) /= pe(e) * ve(e) * der(e, n); else sm(e, n) = 0; //cas d'une evanescence - - return sm; + }); + end_gpu_timer(__KERNEL_NAME__); + return tab_sm; } void Solveur_Masse_Elem_proto::dimensionner_blocs_proto(matrices_t matrices, const tabs_t& semi_impl) const diff --git a/src/Kernel/Framework/Solveur_Masse_base.cpp b/src/Kernel/Framework/Solveur_Masse_base.cpp index 01e61e9d18..d3e2b64e78 100644 --- a/src/Kernel/Framework/Solveur_Masse_base.cpp +++ b/src/Kernel/Framework/Solveur_Masse_base.cpp @@ -125,6 +125,7 @@ 
DoubleTab& Solveur_Masse_base::appliquer(DoubleTab& x) const DoubleTab_parts values_parts(values); tab_divide_any_shape(x, values_parts[0], VECT_REAL_ITEMS); } + Debog::verifier("Solveur_Masse_base::appliquer before appliquer_impl, x:",x); return appliquer_impl(x); // M-1.x } diff --git a/src/Kernel/Framework/Sources.cpp b/src/Kernel/Framework/Sources.cpp index 369295b583..f9a718a90b 100644 --- a/src/Kernel/Framework/Sources.cpp +++ b/src/Kernel/Framework/Sources.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -16,6 +16,7 @@ #include #include #include +#include Implemente_instanciable(Sources, "Sources", LIST(Source)); // XD sources listobj sources -1 source_base -1 The sources. @@ -84,7 +85,12 @@ Entree& Sources::readOn(Entree& is) */ DoubleTab& Sources::ajouter(DoubleTab& xx) const { - for (const auto& itr : *this) itr.ajouter(xx); + Debog::verifier("Sources::ajouter() debut xx=",xx); + for (const auto& itr : *this) + { + itr.ajouter(xx); + Debog::verifier(itr.que_suis_je()+"::ajouter() apres xx=",xx); + } return xx; } diff --git a/src/Kernel/Geometrie/DecoupeBord.cpp b/src/Kernel/Geometrie/DecoupeBord.cpp index ce0fef8ff4..4a97cd026f 100644 --- a/src/Kernel/Geometrie/DecoupeBord.cpp +++ b/src/Kernel/Geometrie/DecoupeBord.cpp @@ -222,11 +222,11 @@ void Impl_32_64<_SIZE_>::create_listb_from_xyz(const Domaine_t& dom, const Noms& for (int j=0; j(res+0.5); nb=std::max(nb,face_min); diff --git a/src/Kernel/Geometrie/DomaineCutter.cpp b/src/Kernel/Geometrie/DomaineCutter.cpp index 581e49d8a4..98ce03ddbf 100644 --- a/src/Kernel/Geometrie/DomaineCutter.cpp +++ b/src/Kernel/Geometrie/DomaineCutter.cpp @@ -1304,7 +1304,6 @@ void DomaineCutter_32_64<_SIZE_>::ecrire_domaines(const Nom& basename, const 
Dom //To detect my parts (when running in parallel) ArrOfInt myDomaines(nb_parties_); - myDomaines = 0; ArrsOfInt otherProcDomaines(Process::nproc()); //if some domains are splitted between multiple procs, @@ -1681,7 +1680,6 @@ void DomaineCutter_32_64<_SIZE_>::ecrire_domaines(const Nom& basename, const Dom } } ArrOfInt tmp_edge_cut(nb_parties_); - tmp_edge_cut = 0; recevoir(tmp_edge_cut, proc, 0, proc+2008); for(int i_part=0; i_part& type_elem, const type_elem_face = "quadrangle_VEF"; break; case Type_Face::quadrangle_3D: - type_elem_face = (sub_type(Hexaedre,type_elem)?"rectangle":"quadrangle_VEF"); + type_elem_face = ((sub_type(Hexaedre,type_elem) || sub_type(Hexaedre_64, type_elem))?"rectangle":"quadrangle_VEF"); break; case Type_Face::quadrangle_3D_axi: type_elem_face = "quadrangle_VEF"; diff --git a/src/Kernel/Geometrie/Extraire_domaine.cpp b/src/Kernel/Geometrie/Extraire_domaine.cpp index 41f49f8302..2b9dd71fa1 100644 --- a/src/Kernel/Geometrie/Extraire_domaine.cpp +++ b/src/Kernel/Geometrie/Extraire_domaine.cpp @@ -89,7 +89,7 @@ Entree& Extraire_domaine::interpreter_(Entree& is) parser_condition_elements.setVar(2,xp(elem,2),threadId); double res=parser_condition_elements.eval(threadId); parser_condition_elements.release(threadId); - if (std::fabs(res)>1e-5) + if (Kokkos::fabs(res)>1e-5) { marq_elem(elem)=1; local_nb_elem_m++; diff --git a/src/Kernel/Geometrie/Extraire_surface.cpp b/src/Kernel/Geometrie/Extraire_surface.cpp index d3a84e930e..e3089c6996 100644 --- a/src/Kernel/Geometrie/Extraire_surface.cpp +++ b/src/Kernel/Geometrie/Extraire_surface.cpp @@ -143,7 +143,7 @@ void Extraire_surface::extraire_surface_without_cleaning(Domaine& domaine_surfac parser_condition_elements.setVar(2,xp(elem,2),threadId); double res = parser_condition_elements.eval(threadId); parser_condition_elements.release(threadId); - marq_elem(elem) = std::fabs(res)>1e-5 ? 1 : 0; + marq_elem(elem) = Kokkos::fabs(res)>1e-5 ? 
1 : 0; }); end_gpu_timer(__KERNEL_NAME__); tab_marq_elem.echange_espace_virtuel(); @@ -226,7 +226,7 @@ void Extraire_surface::extraire_surface_without_cleaning(Domaine& domaine_surfac parser_condition_faces.setVar(2,xv(fac,2),threadId); double res=parser_condition_faces.eval(threadId); parser_condition_faces.release(threadId); - if (std::fabs(res)>1e-5) + if (Kokkos::fabs(res)>1e-5) if (marq[fac]!=-1) // pas un joint, ou on est le proprietaire { marq[fac]=1; diff --git a/src/Kernel/Geometrie/Transformer.cpp b/src/Kernel/Geometrie/Transformer.cpp index 4d186ecd3b..dfdb3040d1 100644 --- a/src/Kernel/Geometrie/Transformer.cpp +++ b/src/Kernel/Geometrie/Transformer.cpp @@ -93,9 +93,9 @@ void Transformer_32_64<_SIZE_>::transformer(Domaine_t& dom, Noms& les_fcts) } for (int j = 0; j < Objet_U::dimension; j++) { - fxyz[j].setVar("x", x); - fxyz[j].setVar("y", y); - fxyz[j].setVar("z", z); + fxyz[j].setVar(0, x); + fxyz[j].setVar(1, y); + fxyz[j].setVar(2, z); new_sommets(i, j) = fxyz[j].eval(); } } diff --git a/src/Kernel/Math/MD_Vector_base.h b/src/Kernel/Math/MD_Vector_base.h index d76550bf88..44118d0c61 100644 --- a/src/Kernel/Math/MD_Vector_base.h +++ b/src/Kernel/Math/MD_Vector_base.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/src/Kernel/Math/MD_Vector_composite.h b/src/Kernel/Math/MD_Vector_composite.h index c191caac4c..92ef1154ff 100644 --- a/src/Kernel/Math/MD_Vector_composite.h +++ b/src/Kernel/Math/MD_Vector_composite.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/src/Kernel/Math/MD_Vector_mono.cpp b/src/Kernel/Math/MD_Vector_mono.cpp index 09daa49370..302ffb6bf7 100644 --- a/src/Kernel/Math/MD_Vector_mono.cpp +++ b/src/Kernel/Math/MD_Vector_mono.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -42,10 +42,8 @@ Sortie& MD_Vector_mono::printOn(Sortie& os) const return os; } -void flatten(const ArrOfInt& blocs_items, ArrOfInt& items) +static void flatten(const ArrOfInt& blocs_items, ArrOfInt& items) { - // Build from blocs_items_to_sum_ - //const ArrOfInt& blocs_items_to_sum = get_blocs_items_to_sum(); const int nblocs = blocs_items.size_array() >> 1; const int *bloc_itr = blocs_items.addr(); int size = 0; @@ -56,30 +54,27 @@ void flatten(const ArrOfInt& blocs_items, ArrOfInt& items) size += end_bloc - begin_bloc; } items.resize(size); - int item = 0; + int k = 0; bloc_itr = blocs_items.addr(); for (int bloc=0; bloc 0) + flatten(blocs_items_to_sum_, items_to_sum_); return items_to_sum_; } const ArrOfInt& MD_Vector_mono::get_items_to_compute() const { - if (items_to_compute_.size_array()==0) - flatten(get_blocs_items_to_compute(), items_to_compute_); + if (items_to_compute_.size_array() == 0 && blocs_items_to_compute_.size_array() > 0) + flatten(blocs_items_to_compute_, items_to_compute_); return items_to_compute_; } diff --git a/src/Kernel/Math/MD_Vector_mono.h b/src/Kernel/Math/MD_Vector_mono.h index 7e246f5ece..bfe8b97b42 100644 --- a/src/Kernel/Math/MD_Vector_mono.h +++ b/src/Kernel/Math/MD_Vector_mono.h @@ -1,5 +1,5 @@ /**************************************************************************** 
-* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/src/Kernel/Math/Matrices/Matrice_Base.cpp b/src/Kernel/Math/Matrices/Matrice_Base.cpp index fb0f2f987c..1439c7bd30 100644 --- a/src/Kernel/Math/Matrices/Matrice_Base.cpp +++ b/src/Kernel/Math/Matrices/Matrice_Base.cpp @@ -55,7 +55,7 @@ void Matrice_Base::build_stencil() Process::exit( ); } -int Matrice_Base::get_stencil_size() const +auto Matrice_Base::get_stencil_size() const { return stencil_.dimension( 0 ); } diff --git a/src/Kernel/Math/Matrices/Matrice_Base.h b/src/Kernel/Math/Matrices/Matrice_Base.h index 629e43e55f..845a404ead 100644 --- a/src/Kernel/Math/Matrices/Matrice_Base.h +++ b/src/Kernel/Math/Matrices/Matrice_Base.h @@ -78,15 +78,18 @@ public : virtual void get_symmetric_stencil_and_coefficients(Stencil& stencil, StencilCoeffs& coefficients) const; - int get_stencil_size() const ; + auto get_stencil_size() const ; virtual void build_stencil(); void set_stencil( const Stencil& stencil ); bool is_stencil_up_to_date() const ; + void set_has_constant_nullspace(bool has_constant_nullspace) { has_constant_nullspace_ = has_constant_nullspace; } + bool has_constant_nullspace() const { return has_constant_nullspace_; } protected: bool is_stencil_up_to_date_ = false; + bool has_constant_nullspace_ = false; Stencil stencil_ ; }; diff --git a/src/Kernel/Math/Matrices/Matrice_Bloc.cpp b/src/Kernel/Math/Matrices/Matrice_Bloc.cpp index d91f2e2269..713d154663 100644 --- a/src/Kernel/Math/Matrices/Matrice_Bloc.cpp +++ b/src/Kernel/Math/Matrices/Matrice_Bloc.cpp @@ -20,6 +20,7 @@ #include #include #include +#include Implemente_instanciable_sans_constructeur(Matrice_Bloc,"Matrice_Bloc",Matrice_Base); @@ -229,8 +230,8 @@ void Matrice_Bloc::get_stencil( Stencil& stencil ) const Stencil& local_stencil = local_stencils[ i * nb_column_blocks 
+ j ]; local_matrix.get_stencil( local_stencil ); - const int size = local_stencil.dimension( 0 ); - for ( int k=0; k(local_matrix, local_stencil, local_coeff); - const int size = local_stencil.dimension( 0 ); + const auto size = local_stencil.dimension( 0 ); - for ( int k=0; k(local_matrix, local_stencil, local_coefficients); - const int size = local_stencil.dimension( 0 ); - for ( int k=0; k tab_bloc_nnz(4); + auto bloc_nnz = tab_bloc_nnz.view_rw(); + const auto stencil = tab_stencil.view_ro(); + IntArrView cnt_RR = tab_cnt_RR.view_rw(); + IntArrView cnt_RV = tab_cnt_RV.view_rw(); + IntArrView cnt_VR = tab_cnt_VR.view_rw(); + IntArrView cnt_VV = tab_cnt_VV.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nnz, KOKKOS_LAMBDA(const nnz_t k) + { + const int row = stencil(k,0); + const int col = stencil(k,1); + if (col < j) + { + if (row < i) + { + Kokkos::atomic_inc(&cnt_RR[row]); + Kokkos::atomic_inc(&bloc_nnz[0]); + } + else + { + Kokkos::atomic_inc(&cnt_VR[row-i]); + Kokkos::atomic_inc(&bloc_nnz[1]); + } + } + else + { + if (row < i) + { + Kokkos::atomic_inc(&cnt_RV[row]); + Kokkos::atomic_inc(&bloc_nnz[2]); + } + else + { + Kokkos::atomic_inc(&cnt_VV[row-i]); + Kokkos::atomic_inc(&bloc_nnz[3]); + } + } + }); + end_gpu_timer(__KERNEL_NAME__); + RR.dimensionner(i, j, tab_bloc_nnz(0)); + RV.dimensionner(i, m-j, tab_bloc_nnz(2)); + VR.dimensionner(n-i, j, tab_bloc_nnz(1)); + VV.dimensionner(n-i, m-j, tab_bloc_nnz(3)); + // Fill tab1: + TRUSTArray tab_ptr_RR(i), tab_ptr_RV(i), tab_ptr_VR(n-i), tab_ptr_VV(n-i); + auto ptr_RR = tab_ptr_RR.view_rw(); + auto ptr_RV = tab_ptr_RV.view_rw(); + auto ptr_VR = tab_ptr_VR.view_rw(); + auto ptr_VV = tab_ptr_VV.view_rw(); + auto RR_tab1 = RR.get_set_tab1().view_wo(); + Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), i+1, KOKKOS_LAMBDA(const int row, nnz_t& offset, const bool final) + { + if (final) RR_tab1[row] = offset; + if (row < i) + { + if (final) ptr_RR[row] = offset; + offset += cnt_RR[row]; + } + 
}); + end_gpu_timer(__KERNEL_NAME__); + auto RV_tab1 = RV.get_set_tab1().view_wo(); + Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), i+1, KOKKOS_LAMBDA(const int row, nnz_t& offset, const bool final) + { + if (final) RV_tab1[row] = offset; + if (row < i) + { + if (final) ptr_RV[row] = offset; + offset += cnt_RV[row]; + } + }); + end_gpu_timer(__KERNEL_NAME__); + auto VR_tab1 = VR.get_set_tab1().view_wo(); + Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), n-i+1, KOKKOS_LAMBDA(const int row, nnz_t& offset, const bool final) + { + if (final) VR_tab1[row] = offset; + if (row < n-i) + { + if (final) ptr_VR[row] = offset; + offset += cnt_VR[row]; + } + }); + end_gpu_timer(__KERNEL_NAME__); + auto VV_tab1 = VV.get_set_tab1().view_wo(); + Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), n-i+1, KOKKOS_LAMBDA(const int row, nnz_t& offset, const bool final) + { + if (final) VV_tab1[row] = offset; + if (row < n-i) + { + if (final) ptr_VV[row] = offset; + offset += cnt_VV[row]; + } + }); + end_gpu_timer(__KERNEL_NAME__); + + // Fill tab2 and coeff: + auto RR_tab2 = RR.get_set_tab2().view_wo(); + auto RR_coeff = RR.get_set_coeff().view_wo(); + auto RV_tab2 = RV.get_set_tab2().view_wo(); + auto RV_coeff = RV.get_set_coeff().view_wo(); + auto VR_tab2 = VR.get_set_tab2().view_wo(); + auto VR_coeff = VR.get_set_coeff().view_wo(); + auto VV_tab2 = VV.get_set_tab2().view_wo(); + auto VV_coeff = VV.get_set_coeff().view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nnz, KOKKOS_LAMBDA(const nnz_t k) + { + const int row = stencil(k,0); + const int col = stencil(k,1); + const double coeff = 0.; + if (col < j) + { + if (row < i) + { + auto slot = Kokkos::atomic_fetch_inc(&ptr_RR[row]); + RR_tab2[slot] = col; + RR_coeff[slot] = coeff; + } + else + { + auto slot = Kokkos::atomic_fetch_inc(&ptr_VR[row-i]); + VR_tab2[slot] = col; + VR_coeff[slot] = coeff; + } + } + else + { + if (row < i) + { + auto slot = Kokkos::atomic_fetch_inc(&ptr_RV[row]); + 
RV_tab2[slot] = col-j; + RV_coeff[slot] = coeff; + } + else + { + auto slot = Kokkos::atomic_fetch_inc(&ptr_VV[row-i]); + VV_tab2[slot] = col-j; + VV_coeff[slot] = coeff; + } + } + }); + end_gpu_timer(__KERNEL_NAME__); + // Convertit en notation Fortran RR.formeF(); RV.formeF(); VR.formeF(); VV.formeF(); - - // Compactage de la matrice RR.compacte(); RV.compacte(); VR.compacte(); VV.compacte(); + // Tri par colonne croissante + RR.sort_stencil(); + RV.sort_stencil(); + VR.sort_stencil(); + VV.sort_stencil(); } Matrice_Bloc& Matrice_Bloc::operator *=( double x ) diff --git a/src/Kernel/Math/Matrices/Matrice_Bloc.h b/src/Kernel/Math/Matrices/Matrice_Bloc.h index a0d0bb21ea..2b457e9c51 100644 --- a/src/Kernel/Math/Matrices/Matrice_Bloc.h +++ b/src/Kernel/Math/Matrices/Matrice_Bloc.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -93,14 +94,8 @@ public : int nb_bloc_lignes() const; // retourne N_ int nb_bloc_colonnes(void ) const; // retourne M_ - // Remplissage par une matrice morse symetrique - void remplir(const IntLists& voisins, const DoubleLists& valeurs, const DoubleVect& terme_diag, const int i, const int n); - - // // Remplissage par une matrice morse - void remplir(const IntLists& voisins, const DoubleLists& valeurs, const int i, const int n, const int j, const int m); - - // Remplissage par une matrice morse symetrique ou non - void remplir(const IntLists& voisins, const DoubleLists& valeurs, const DoubleVect& terme_diag, const int i, const int n, const int j, const int m); + // Remplissage depuis un Stencil + void remplir(const Stencil& stencil, const int i, const int n, int j=-1, int m=-1); // // Conversion vers une Matrice_Morse void block_to_morse( Matrice_Morse& matrix ) const; diff --git a/src/Kernel/Math/Matrices/Matrice_Bloc_Sym.cpp b/src/Kernel/Math/Matrices/Matrice_Bloc_Sym.cpp index 1a3a730c79..cb0c80e0eb 100644 --- a/src/Kernel/Math/Matrices/Matrice_Bloc_Sym.cpp +++ b/src/Kernel/Math/Matrices/Matrice_Bloc_Sym.cpp @@ 
-343,8 +343,8 @@ void Matrice_Bloc_Sym::get_symmetric_stencil( Stencil& stencil ) const local_matrix.get_stencil( local_stencil_ ); } - const int size = local_stencil_.dimension( 0 ); - for ( int k=0; k tab_elim_coeff(tab2_.size_array()); + int nb_coefficient_to_suppress=0; // Nombre de coefficients supprimes if (elim_coeff_nul) { ArrOfDouble tab_coeff_max(n); - tab_coeff_max = 0.; // Recherche des coefficients nuls hors diagonale a supprimer de la matrice morse + const auto tab1 = tab1_.view_ro(); + const auto coeff = coeff_.view_ro(); + DoubleArrView coeff_max = tab_coeff_max.view_rw(); + auto elim_coeff = tab_elim_coeff.view_rw(); + int coeff_nuls = 0; + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i, int& coeff_nuls_local) { - ArrOfInt tab_cnt(1); - tab_cnt = 0; - auto tab1 = tab1_.view_ro(); - CDoubleArrView coeff = coeff_.view_ro(); - DoubleArrView coeff_max = tab_coeff_max.view_rw(); - auto elim_coeff = tab_elim_coeff.view_rw(); - IntArrView cnt = tab_cnt.view_rw(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i) - { - auto k1 = tab1(i)-1; - auto k2 = tab1(i+1)-1; - for (auto k = k1; k < k2; k++) - { - double abs_c = Kokkos::fabs(coeff(k)); - if (abs_c > coeff_max(i)) coeff_max(i) = abs_c; - if (coeff(k) == 0) - { - Kokkos::atomic_add(&cnt(0), 1); - elim_coeff(k) = 1; - } - } - }); - end_gpu_timer(__KERNEL_NAME__); - coeff_nuls = tab_cnt(0); - } - + nnz_t k1 = tab1(i) - 1; + nnz_t k2 = tab1(i + 1) - 1; + for (nnz_t k = k1; k < k2; k++) + { + if (Kokkos::fabs(coeff(k)) > coeff_max(i)) coeff_max(i) = Kokkos::fabs(coeff(k)); + if (coeff(k) == 0) + { + coeff_nuls_local++; + elim_coeff(k) = 1; + } + } + }, coeff_nuls); + end_gpu_timer(__KERNEL_NAME__); + nb_coefficient_to_suppress+=coeff_nuls; if (elim_coeff_nul==2) { // Recherche des coefficients quasi nuls hors diagonale (1.e-12 plus petit que le coefficient le plus grand de la ligne) a supprimer de la 
matrice morse const double eps = Objet_U::precision_geom; - ArrOfInt tab_cnt(1); - tab_cnt = 0; - auto tab1 = tab1_.view_ro(); - CDoubleArrView coeff = coeff_.view_ro(); - CDoubleArrView coeff_max = tab_coeff_max.view_ro(); - IntArrView elim_coeff = tab_elim_coeff.view_rw(); - IntArrView cnt = tab_cnt.view_rw(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i) + int coeff_quasi_nuls = 0; + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i, int& coeff_quasi_nuls_local) { double cm = coeff_max(i); - if (!est_egal(cm, 0., eps) && cm < 1e10) + if (!est_egal(cm, 0., eps) && cm < 1e10) // Le plus grand coefficient doit etre strictement positif { - auto k1 = tab1(i) - 1; - auto k2 = tab1(i + 1) - 1; - for (auto k = k1; k < k2; k++) - if (coeff(k) != 0 && est_egal(Kokkos::fabs(coeff(k)) / cm, 0., eps)) + nnz_t k1 = tab1(i) - 1; + nnz_t k2 = tab1(i + 1) - 1; + for (nnz_t k = k1; k < k2; k++) + if (coeff(k) != 0 // Les coefficients nuls ont deja ete trouves + && est_egal(Kokkos::fabs(coeff(k)) / cm, 0., eps)) { - Kokkos::atomic_add(&cnt(0), 1); + coeff_quasi_nuls_local++; elim_coeff(k) = 1; } } - }); + }, coeff_quasi_nuls); end_gpu_timer(__KERNEL_NAME__); - coeff_quasi_nuls = tab_cnt(0); + nb_coefficient_to_suppress+=coeff_quasi_nuls; } } // Recherche des coefficients doublons - int nb_doublons=0; + int doublons = 0; + const auto tab1 = tab1_.view_ro(); + const auto tab2 = tab2_.view_ro(); + const auto coeff = coeff_.view_ro(); + auto elim_coeff = tab_elim_coeff.view_rw(); + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA(const int i, int& doublons_local) { - auto tab1 = tab1_.view_ro(); - CIntArrView tab2 = tab2_.view_ro(); - CDoubleArrView coeff = coeff_.view_ro(); - IntArrView elim_coeff = tab_elim_coeff.view_rw(); - ArrOfInt tab_doublons(1); - tab_doublons = 0; - ArrOfInt tab_error(1); - tab_error = 0; - IntArrView doublons = 
tab_doublons.view_rw(); - IntArrView error = tab_error.view_rw(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i) - { - auto k1 = tab1(i)-1; - auto k2 = tab1(i+1)-1; - int jmax = -1; // Highest column of a coefficient in the line i - for (auto k = k1; k < k2; k++) - { - int j = tab2(k)-1; - if (j > jmax) - jmax = j; - else - { - // Found a column j lower than jmax, check if not defined before: - for (auto kk = k-1; kk >= k1; kk--) - { - int jj = tab2(kk)-1; - if (jj == j) - { - // Already defined! - Kokkos::atomic_add(&doublons(0), 1); - elim_coeff(k) = 1; - // Check if same coefficients: - if (coeff(kk) != coeff(k)) - Kokkos::atomic_add(&error(0), 1); - break; - } - } - } - } - }); - end_gpu_timer(__KERNEL_NAME__); - nb_doublons = tab_doublons(0); - if (tab_error(0)) - { - Cerr << "Error in a Matrix Morse: duplicate entries with different values!" << finl; - exit(); - } - } - - auto nnz(tab1_(0)); - nnz=0; - if (nb_doublons || coeff_nuls || coeff_quasi_nuls) - { - // Step 1: Count kept entries per row (parallel_for over rows) - ArrOfInt tab_kept_per_row(n); + nnz_t k1 = tab1(i) - 1; + nnz_t k2 = tab1(i + 1) - 1; + int jmax = -1; // Highest column of a coefficient in the line i + for (nnz_t k = k1; k < k2; k++) { - auto tab1 = tab1_.view_ro(); - CIntArrView elim_coeff = tab_elim_coeff.view_ro(); - IntArrView kept_per_row = tab_kept_per_row.view_wo(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i) - { - int count = 0; - auto k1 = tab1(i)-1; - auto k2 = tab1(i+1)-1; - for (auto k = k1; k < k2; k++) - if (!elim_coeff(k)) count++; - kept_per_row(i) = count; - }); - end_gpu_timer(__KERNEL_NAME__); - } - - // Step 2: Save old tab1_ (needed for source offsets in scatter step) - auto old_tab1(tab1_); - - // Step 3: Update tab1_ via prefix scan (updates tab1_(1..n), tab1_(0)=1 unchanged) - using tab1_scan_t = decltype(nnz); - { - auto tab1 = tab1_.view_rw(); - 
CIntArrView kept_per_row = tab_kept_per_row.view_ro(); - Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i, tab1_scan_t& update, const bool final) - { - update += kept_per_row(i); - if (final) tab1(i+1) = update + 1; - }); - end_gpu_timer(__KERNEL_NAME__); - } - - // Step 4: Out-of-place scatter of coeff_ and tab2_ to new positions (parallel_for over rows) - // Safe because new_pos(i) <= old_pos(i) always, and rows are processed independently - nnz = tab1_[n] - 1; - auto new_coeff(coeff_); - auto new_tab2(tab2_); - { - auto tab1 = tab1_.view_ro(); - auto old_tab1_ro = old_tab1.view_ro(); - CDoubleArrView coeff_src = coeff_.view_ro(); - CIntArrView tab2_src = tab2_.view_ro(); - DoubleArrView coeff_dst = new_coeff.view_wo(); - IntArrView tab2_dst = new_tab2.view_wo(); - CIntArrView elim_coeff = tab_elim_coeff.view_ro(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i) - { - auto new_pos = tab1(i) - 1; - auto k1 = old_tab1_ro(i)-1; - auto k2 = old_tab1_ro(i+1)-1; - for (auto k = k1; k < k2; k++) - if (!elim_coeff(k)) + int j = tab2(k) - 1; + if (j > jmax) + jmax = j; + else + { + // Found a column j lower than jmax, check if not defined before: + for (nnz_t kk = k-1; kk >= k1; kk--) { - coeff_dst(new_pos) = coeff_src(k); - tab2_dst(new_pos) = tab2_src(k); - new_pos++; + int jj = tab2(kk) - 1; + if (jj == j) + { + // Already defined! 
+ doublons_local++; + elim_coeff(k) = 1; + // Check if same coefficients: + if (coeff(kk) != coeff(k)) Process::Kokkos_exit("Error in Matrice_Morse::compacte !"); + break; + } } - }); - end_gpu_timer(__KERNEL_NAME__); - } - - // Step 5: Copy compacted data back - { - auto tab2 = tab2_.view_rw(); - auto coeff = coeff_.view_rw(); - CIntArrView new_tab2_ro = new_tab2.view_ro(); - CDoubleArrView new_coeff_ro = new_coeff.view_ro(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nnz), KOKKOS_LAMBDA(const int i) - { - tab2(i) = new_tab2_ro(i); - coeff(i) = new_coeff_ro(i); - }); - end_gpu_timer(__KERNEL_NAME__); + } } - } - else + }, doublons); + end_gpu_timer(__KERNEL_NAME__); + nb_coefficient_to_suppress+=doublons; + if (nb_coefficient_to_suppress) { - nnz = tab1_[n] - 1; + Cerr << nb_coefficient_to_suppress << " null or duplicated coefficients removed from a CSR matrix." << finl; + // Copie de la matrice: + ToDo_Kokkos("avoid this 3 copy..."); + auto tab_old_tab1(tab1_); + auto tab_old_tab2(tab2_); + auto tab_old_coeff(coeff_); + // Redimensionnement de l'actuelle: + tab2_.resize(tab_old_coeff.size() - nb_coefficient_to_suppress); + coeff_.resize(tab_old_coeff.size() - nb_coefficient_to_suppress); + // Copie des coefficients a garder: + const auto old_tab1 = tab_old_tab1.view_ro(); + const auto old_tab2 = tab_old_tab2.view_ro(); + const auto old_coeff = tab_old_coeff.view_ro(); + auto new_tab1 = tab1_.view_wo(); + auto new_tab2 = tab2_.view_wo(); + auto new_coeff = coeff_.view_wo(); + Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA(const int i, nnz_t& offset, const bool final) + { + nnz_t k1 = old_tab1(i) - 1; + nnz_t k2 = old_tab1(i+1) - 1; + nnz_t count = 0; + for (nnz_t k = k1; k < k2; k++) + if (!elim_coeff(k)) count++; + if (final) + { + new_tab1(i+1) = offset + count + 1; + nnz_t nnz = offset; + for (nnz_t k = k1; k < k2; k++) + if (!elim_coeff(k)) + { + new_coeff(nnz) = old_coeff(k); + new_tab2(nnz) = old_tab2(k); + 
nnz++; + } + } + offset += count; + }); + end_gpu_timer(__KERNEL_NAME__); } - - // On redimensionne les tableaux - tab2_.resize(nnz); - coeff_.resize(nnz); - morse_matrix_structure_has_changed_=1, sorted_ = 0; assert_check_morse_matrix_structure( ); } @@ -815,9 +738,10 @@ Matrice_Morse& Matrice_Morse::diagmulmat(const DoubleVect& x) Cerr << "Matrice_Morse::diagmulmat bad dimensions" << finl; exit(); } + set_tab1_int32(); F77NAME(DIAMUA)(&m ,&l, - coeff_.addr(),tab2_.addr(),reinterpret_cast(tab1_.addr()),x.addr(), - coeff_.addr(),tab2_.addr(),reinterpret_cast(tab1_.addr())); + coeff_.addr(),tab2_.addr(),get_tab1_int32().addr(),x.addr(), + coeff_.addr(),tab2_.addr(),const_cast(get_tab1_int32().addr())); return(*this); } @@ -1262,6 +1186,7 @@ int Matrice_Morse::inverse(const DoubleVect& secmem, DoubleVect& solution, int minits = 10; int maxits = std::max(minits, retry_on_failure ? nn : max_iter); int io = 0; + set_tab1_int32(); F77NAME(PGMRES)(&nn, &ima, toto.addr(), solution.addr(), vv.addr(), &coeff_seuilr, &maxits, &io, coeff_.addr(), tab2_.addr(), get_tab1_int32().addr(), alu.addr(), jlu.addr(), ju.addr(), &ie); @@ -1748,10 +1673,13 @@ void Matrice_Morse::remplir(const int ideb, const int jdeb, const int n, const i void Matrice_Morse::formeC() { int n=nb_lignes(); + ToDo_Kokkos("critical"); for(int ii=0; ii<=n; ii++) tab1_(ii)--; + ToDo_Kokkos("critical"); for(int ii=0; ii k1 && tab2(m - 1) > col_k) + { + tab2(m) = tab2(m - 1); + coeff(m) = coeff(m - 1); + --m; + } + tab2(m) = col_k; + coeff(m) = val_k; + } + }); + end_gpu_timer(__KERNEL_NAME__); + + morse_matrix_structure_has_changed_ = 1; + sorted_ = 1; + return *this; +} + // Explicit instantiations for 'auto nnz' abbreviated function templates template Matrice_Morse::Matrice_Morse(int, int); template Matrice_Morse::Matrice_Morse(int, int, int); diff --git a/src/Kernel/Math/Matrices/Matrice_Morse.h b/src/Kernel/Math/Matrices/Matrice_Morse.h index aa0caea9dd..590e90a7da 100644 --- 
a/src/Kernel/Math/Matrices/Matrice_Morse.h +++ b/src/Kernel/Math/Matrices/Matrice_Morse.h @@ -46,6 +46,7 @@ * * @sa Matrice_Base Matrice_Morse_Sym */ +class Matrice_Morse_Sym; class Matrice_Morse : public Matrice_Base { @@ -164,6 +165,9 @@ public : // A= creat_transposee(B) virtual Matrice_Morse& transpose(const Matrice_Morse& a); + // Build full (non-symmetric) Morse matrix from a symmetric one (upper triangle storage) + Matrice_Morse& convert(const Matrice_Morse_Sym& MS); + // A=x*A (x vecteur diag) virtual Matrice_Morse& diagmulmat(const DoubleVect& x); diff --git a/src/Kernel/Math/Matrices/Matrice_Morse_Sym.cpp b/src/Kernel/Math/Matrices/Matrice_Morse_Sym.cpp index bffbbb31a1..66702d300d 100644 --- a/src/Kernel/Math/Matrices/Matrice_Morse_Sym.cpp +++ b/src/Kernel/Math/Matrices/Matrice_Morse_Sym.cpp @@ -561,7 +561,7 @@ void Matrice_Morse_Sym::get_symmetric_stencil( Stencil& stencil ) const } } - const int new_size = stencil.dimension( 0 ); + const auto new_size = stencil.dimension( 0 ); stencil.resize( new_size, 2 ); } @@ -633,7 +633,7 @@ void Matrice_Morse_Sym::get_symmetric_stencil_and_coefficients( Stencil& st } } - const int new_size = stencil.dimension( 0 ); + const auto new_size = stencil.dimension( 0 ); assert( coefficients.size_array( ) == new_size ); diff --git a/src/Kernel/Math/Matrices/Matrice_Sym.cpp b/src/Kernel/Math/Matrices/Matrice_Sym.cpp index c385a1d1c2..24cd5a07df 100644 --- a/src/Kernel/Math/Matrices/Matrice_Sym.cpp +++ b/src/Kernel/Math/Matrices/Matrice_Sym.cpp @@ -35,8 +35,8 @@ void Matrice_Sym::unsymmetrize_stencil( const int nb_lines, ArrOfInt offsets( nb_lines + 1 ); offsets[ 0 ] = 0; - const int symmetric_stencil_size = symmetric_stencil.dimension( 0 ); - for ( int k=0; k 0 ) @@ -167,66 +167,63 @@ void Matrix_tools::allocate_morse_matrix( const int nb_lines, Matrice_Morse& matrix , const bool& attach_stencil_to_matrix ) { - assert( is_normalized_stencil( stencil ) ); - - const int nb_coefficients = stencil.dimension( 0 ); - 
matrix.dimensionner( nb_lines, nb_columns, nb_coefficients ); - { - auto& tab1 = matrix.get_set_tab1(); - auto& tab2 = matrix.get_set_tab2(); - if ( nb_coefficients > 0 ) - { - tab1 = 0; - tab1[0] = 1; - for ( int i=0; i Build with no coefficients + build_morse_matrix(nb_lines, nb_columns, stencil, coefficients, matrix, attach_stencil_to_matrix); } void Matrix_tools::build_morse_matrix( const int nb_lines, const int nb_columns, - const Stencil& stencil, - const StencilCoeffs& coefficients, - Matrice_Morse& matrix ) + const Stencil& tab_stencil, + const StencilCoeffs& tab_coefficients, + Matrice_Morse& matrix, + const bool& attach_stencil_to_matrix) { // No : stencil do not rely on sorted columns //assert( is_normalized_stencil( stencil ) ); - const int nb_coefficients = stencil.dimension( 0 ); - assert( nb_coefficients == coefficients.size_array( ) ); + using nnz_t = decltype(tab_stencil.dimension(0)); + const nnz_t nnz = tab_stencil.dimension( 0 ); + bool has_coefficients = tab_coefficients.size_array() != 0; + assert(!has_coefficients || nnz == tab_coefficients.size_array()); matrix.dimensionner( nb_lines, nb_columns, - nb_coefficients ); + nnz ); - if ( nb_coefficients > 0 ) + if (nnz > 0) { - matrix.get_set_tab1() =0 ; - matrix.get_set_tab1()( 0 ) = 1; - for ( int i=0; i= 0 ); - assert( stencil( i ,0 ) < nb_lines ); - assert( stencil( i ,1 ) >= 0 ); - assert( stencil( i ,1 ) < nb_columns ); - - matrix.get_set_tab1()( stencil( i, 0 ) + 1 ) += 1; - matrix.get_set_tab2()( i ) = stencil( i, 1 ) + 1; - matrix.get_set_coeff()( i ) = coefficients[ i ]; - } - for ( int i=0; i 0 ) - { - matrix.get_set_tab1()= 0 ; - matrix.get_set_tab1()( 0 ) = 1; - for ( int i=0; i= 0 ); - assert( stencil( i ,0 ) < order ); - assert( stencil( i ,1 ) >= 0 ); - assert( stencil( i ,1 ) < order ); - assert( stencil( i, 0 ) <= stencil( i, 1 ) ); - - matrix.get_set_tab1()( stencil( i, 0 ) + 1 ) += 1; - matrix.get_set_tab2()( i ) = stencil( i, 1 ) + 1; - } - for ( int i=0; i 0 ) - { - 
matrix.get_set_tab1() = 0 ; - matrix.get_set_tab1()( 0 ) = 1; - for ( int i=0; i= 0 ); - assert( stencil( i ,0 ) < order ); - assert( stencil( i ,1 ) >= 0 ); - assert( stencil( i ,1 ) < order ); - assert( stencil( i, 0 ) <= stencil( i, 1 ) ); - - matrix.get_set_tab1()( stencil( i, 0 ) + 1 ) += 1; - matrix.get_set_tab2()( i ) = stencil( i, 1 ) + 1; - matrix.get_set_coeff()( i ) = coefficients[ i ]; - } - for ( int i=0; i -#include #include #include #include #ifdef TRUST_USE_ROCM #include #endif -#include // Mandatory to have MPIX_CUDA_AWARE_SUPPORT defined or not #include Implemente_instanciable(Solv_AMG,"Solv_AMG",SolveurSys_base); @@ -57,7 +55,13 @@ Entree& Solv_AMG::readOn(Entree& is) { // amg GCP|BISGTSTAB|GMRES { atol|rtol doublee [st double] [impr] } is >> solver_; - if ((Motcle)solver_!="GCP") + if ((Motcle)solver_=="GCP") + ksp_type_ = "cg"; + else if ((Motcle)solver_=="GMRES") + ksp_type_ = "gmres"; + else if ((Motcle)solver_=="BICGSTAB") + ksp_type_ = "bcgs"; + else { Cerr << solver_ << " not supported yet for AMG !" << finl; Process::exit(); @@ -87,6 +91,19 @@ Entree& Solv_AMG::readOn(Entree& is) return is; } + +// On some GFX cards, hipsparse crashes so we take Kokkos brackend for PETSc: +Nom petsc_use_kokkos() +{ + Nom chaine_lue(""); + const char* value = std::getenv("ROCM_ARCH"); + if (value != nullptr && std::string(value) == "gfx1100") + { + chaine_lue = " -vec_type kokkos -mat_type aijkokkos "; + } + return chaine_lue; +} + void Solv_AMG::create_block_amg(int n, Nom precond) { if (getenv("TRUST_AMG")!=nullptr) precond = getenv("TRUST_AMG"); @@ -95,6 +112,7 @@ void Solv_AMG::create_block_amg(int n, Nom precond) chaine_lue_+=petsc_cg_issue_ ? "gmres" : "cg"; // Switch CG to GMRES for more robustness (BiCGstab is slower than GMRES 2xSPMV vs 1) chaine_lue_+=rtol_>0 ? Nom(rtol_, " -ksp_rtol %e") : ""; chaine_lue_+=atol_>0 ? 
Nom(atol_, " -ksp_atol %e") : ""; + chaine_lue_+=petsc_use_kokkos(); chaine_lue_+=" -ksp_norm_type UNPRECONDITIONED \ -pc_type fieldsplit \ -pc_fieldsplit_type additive"; @@ -107,19 +125,22 @@ void Solv_AMG::create_block_amg(int n, Nom precond) Cerr << "Use more GPUs, or try slower options: -fieldsplit_P0_pc_gamg_agg_nsmooths 0 -fieldsplit_P1_pc_gamg_agg_nsmooths 0" << finl; chaine_lue_+=" -info :pc -fieldsplit_P0_ksp_type preonly \ -fieldsplit_P0_pc_type gamg \ --fieldsplit_P0_pc_gamg_threshold 0.01 \ -fieldsplit_P0_pc_gamg_square_graph 1 \ --fieldsplit_P1_ksp_type preonly \ +-fieldsplit_P0_pc_gamg_threshold"; + chaine_lue_+=st_>0 ? Nom(st_, " %e") : " 0.01"; + chaine_lue_+=" -fieldsplit_P1_ksp_type preonly \ -fieldsplit_P1_pc_type gamg \ --fieldsplit_P1_pc_gamg_threshold 0.01 \ --fieldsplit_P1_pc_gamg_square_graph 1"; +-fieldsplit_P1_pc_gamg_square_graph 1 \ +-fieldsplit_P1_pc_gamg_threshold"; + chaine_lue_+=st_>0 ? Nom(st_, " %e") : " 0.01"; if (n==3) { chaine_lue_+=" -fieldsplit_Pa_ksp_type preonly \ -fieldsplit_Pa_ksp_type preonly \ -fieldsplit_Pa_pc_type gamg \ --fieldsplit_Pa_pc_gamg_threshold 0.01 \ --fieldsplit_Pa_pc_gamg_square_graph 1"; +-fieldsplit_Pa_pc_gamg_square_graph 1 \ +-fieldsplit_Pa_pc_gamg_threshold"; + chaine_lue_+=st_>0 ? Nom(st_, " %e") : " 0.01"; } // Use Kokkos backend (slower though) to avoid memory issue on Nvidia: // src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu:3269 cuda error 2 (cudaErrorMemoryAllocation) : out of memory @@ -133,23 +154,27 @@ void Solv_AMG::create_block_amg(int n, Nom precond) chaine_lue_+=" -fieldsplit_P0_ksp_type preonly \ -fieldsplit_P0_pc_type hypre \ -fieldsplit_P0_pc_hypre_type boomeramg \ --fieldsplit_P0_pc_hypre_boomeramg_strong_threshold 0.1 \ -fieldsplit_P0_pc_hypre_boomeramg_print_statistics 1 \ --fieldsplit_P1_ksp_type preonly \ +-fieldsplit_P0_pc_hypre_boomeramg_strong_threshold"; + chaine_lue_+=st_>0 ? 
Nom(st_, " %e") : " 0.1"; + chaine_lue_+=" -fieldsplit_P1_ksp_type preonly \ -fieldsplit_P1_pc_type hypre \ -fieldsplit_P1_pc_hypre_type boomeramg \ --fieldsplit_P1_pc_hypre_boomeramg_strong_threshold 0.1 \ --fieldsplit_P1_pc_hypre_boomeramg_print_statistics 1"; +-fieldsplit_P1_pc_hypre_boomeramg_print_statistics 1 \ +-fieldsplit_P1_pc_hypre_boomeramg_strong_threshold"; + chaine_lue_+=st_>0 ? Nom(st_, " %e") : " 0.1"; if (n==3) { chaine_lue_+=" -fieldsplit_Pa_ksp_type preonly \ -fieldsplit_Pa_pc_type hypre \ -fieldsplit_Pa_pc_hypre_type boomeramg \ --fieldsplit_Pa_pc_hypre_boomeramg_strong_threshold 0.1 \ --fieldsplit_Pa_pc_hypre_boomeramg_print_statistics 1"; +-fieldsplit_Pa_pc_hypre_boomeramg_print_statistics 1 \ +-fieldsplit_Pa_pc_hypre_boomeramg_strong_threshold"; + chaine_lue_+=st_>0 ? Nom(st_, " %e") : " 0.1"; } // To avoid this issue on Nvidia: CUSPARSE ERROR (code = 11, insufficient resources) at csr_spgemm_device_cusparse.c:152 -#ifdef TRUST_USE_CUDA + // Seen also on HIP on gfx1100 +#ifdef TRUST_USE_GPU if (n==2) chaine_lue_+=" -fieldsplit_P0_pc_mg_galerkin_mat_product_algorithm hypre"; if (n==2) chaine_lue_+=" -fieldsplit_P1_pc_mg_galerkin_mat_product_algorithm hypre"; if (n==3) chaine_lue_+=" -fieldsplit_Pa_pc_mg_galerkin_mat_product_algorithm hypre"; @@ -160,21 +185,24 @@ void Solv_AMG::create_block_amg(int n, Nom precond) Cerr << "Warning! PETSc with AmgX preconditioner was not tested yet for nnz>2^31 !" << finl; chaine_lue_+=" -fieldsplit_P0_ksp_type preonly \ -fieldsplit_P0_pc_type amgx \ --fieldsplit_P0_pc_amgx_strength_threshold 0.1 \ -fieldsplit_P0_pc_amgx_verbose 1 \ -fieldsplit_P0_pc_amgx_print_grid_stats 1 \ --fieldsplit_P1_ksp_type preonly \ +-fieldsplit_P0_pc_amgx_strength_threshold"; + chaine_lue_+=st_>0 ? 
Nom(st_, " %e") : " 0.1"; + chaine_lue_+=" -fieldsplit_P1_ksp_type preonly \ -fieldsplit_P1_pc_type amgx \ --fieldsplit_P1_pc_amgx_strength_threshold 0.1 \ -fieldsplit_P1_pc_amgx_verbose 1 \ --fieldsplit_P1_pc_amgx_print_grid_stats 1"; +-fieldsplit_P1_pc_amgx_print_grid_stats 1 \ +-fieldsplit_P1_pc_amgx_strength_threshold"; + chaine_lue_+=st_>0 ? Nom(st_, " %e") : " 0.1"; if (n==3) { chaine_lue_+=" -fieldsplit_Pa_ksp_type preonly \ -fieldsplit_Pa_pc_type amgx \ --fieldsplit_Pa_pc_amgx_strength_threshold 0.5 \ -fieldsplit_Pa_pc_amgx_verbose 1 \ --fieldsplit_Pa_pc_amgx_print_grid_stats 1"; +-fieldsplit_Pa_pc_amgx_print_grid_stats 1 \ +-fieldsplit_Pa_pc_amgx_strength_threshold"; + chaine_lue_+=st_>0 ? Nom(st_, " %e") : " 0.1"; } } else @@ -182,52 +210,84 @@ void Solv_AMG::create_block_amg(int n, Nom precond) chaine_lue_ +=" }"; } -Nom boomeramg(double st) +void Solv_AMG::create_gamg() { - Nom chaine(" { precond boomeramg { }"); - if (st>=0) - { - chaine += " cli { -pc_hypre_boomeramg_strong_threshold"; - chaine += Nom(st, "%e"); - chaine += " }"; - } - return chaine; + // Possibibly faster on VDF mesh than boomerAMG ? + chaine_lue_="cli { -ksp_type "; + chaine_lue_+=petsc_cg_issue_ ? "gmres" : ksp_type_; // Switch to GMRES for more robustness (BiCGstab is slower than GMRES 2xSPMV vs 1) + chaine_lue_+=rtol_>0 ? Nom(rtol_, " -ksp_rtol %e") : ""; + chaine_lue_+=atol_>0 ? Nom(atol_, " -ksp_atol %e") : ""; + chaine_lue_+=petsc_use_kokkos(); + chaine_lue_+=" -ksp_norm_type UNPRECONDITIONED \ +-info :pc \ +-pc_type gamg \ +-mg_levels_ksp_max_it 1"; + chaine_lue_+=" -pc_gamg_threshold"; + chaine_lue_+=st_>0 ? Nom(st_, " %e") : " 0.01"; + chaine_lue_ +=" }"; +} + +void Solv_AMG::create_boomeramg() +{ + // We enable explicitly a lot of option to have same convergence on CPU and GPU + // cause by default, PETSc enables differently on CPU and GPU... + // But warning, we have a different behaviour for boomeramg on CPU than 1.9.8 ! 
+ // For instance, by default we have: Coarsening Type = Falgout-CLJP and modified classical interpolation + // Warning ! ext+i seems RAM costly. ext+i-cc is better ? + // Add -pc_hypre_boomeramg_P_max 6 ideal for 3D VDF with 6 neighbours ? + // Chebyshev reduit pas mal le nombre d'iterations mais ms/it plus eleve donc pas de gain sur le total + chaine_lue_="cli { -ksp_type "; + chaine_lue_+=petsc_cg_issue_ ? "gmres" : ksp_type_; // Switch to GMRES for more robustness (BiCGstab is slower than GMRES 2xSPMV vs 1) + chaine_lue_+=rtol_>0 ? Nom(rtol_, " -ksp_rtol %e") : ""; + chaine_lue_+=atol_>0 ? Nom(atol_, " -ksp_atol %e") : ""; + chaine_lue_+=petsc_use_kokkos(); + chaine_lue_+=" -ksp_norm_type UNPRECONDITIONED \ +-pc_type hypre \ +-pc_hypre_type boomeramg \ +-pc_mg_galerkin_mat_product_algorithm hypre \ +-pc_hypre_boomeramg_relax_type_all l1scaled-Jacobi \ +-pc_hypre_boomeramg_no_CF true \ +-pc_hypre_boomeramg_coarsen_type pmis \ +-pc_hypre_boomeramg_interp_type ext+i \ +-pc_hypre_boomeramg_print_statistics 1"; + chaine_lue_+=" -pc_hypre_boomeramg_strong_threshold"; + chaine_lue_+=st_>0 ? Nom(st_, " %e") : " 0.3"; + chaine_lue_ +=" }"; } + void Solv_AMG::create_amg() { // We select the more efficient/robust one: - chaine_lue_ = solver_; #if defined(TRUST_USE_CUDA) - library_ = "petsc_gpu"; - chaine_lue_ += boomeramg(st_); // Best GPU solver - // KSP divergence with cg+boomeramg/amgx on multi-node with MPI GPU Aware (seen also on Lumi) so we switch to gmres (bgcs slower) ! - if (Process::nproc()>4) petsc_cg_issue_ = true; -#if defined(MPIX_CUDA_AWARE_SUPPORT) - if (Process::nproc()>4) + Nom precond = Process::nproc()<=4 ? "boomeramg" : "amgx"; + if (getenv("TRUST_AMG")!=nullptr) + precond = getenv("TRUST_AMG"); // Surcharge possible pour test rapide + + if (precond=="boomeramg") + { + library_ = "petsc_gpu"; + return create_boomeramg(); // Best GPU solver + } + else if (precond=="amgx") { + // Switch to AmgX+AmgXWrapper (soon deprecated?) 
library_ = "amgx"; chaine_lue_ = solver_; chaine_lue_ += " { precond c-amg {"; if (st_>=0) chaine_lue_ += Nom(st_, " p:strength_threshold %e"); chaine_lue_ += " }"; } -#endif -#elif defined(TRUST_USE_ROCM) - library_ = "petsc_gpu"; - const char* value = std::getenv("ROCM_ARCH"); - if (value != nullptr && std::string(value) == "gfx1100") + else { - if (st_>=0) Process::exit("st option not supported yet in Solv_AMG"); - if (Process::is_parallel()) - chaine_lue_ += " { precond ua-amg { }"; // Converge mais plus lent que sa-amg - else - chaine_lue_ += " { precond sa-amg { }"; // Crash en parallele + library_ = "petsc_gpu"; + return create_gamg(); } - else - chaine_lue_ += boomeramg(st_); // Best GPU solver (// sa-amg is slow...) +#elif defined(TRUST_USE_ROCM) + library_ = "petsc_gpu"; + return create_boomeramg(); // Best GPU solver (// sa-amg is slow...) #else library_ = "petsc"; - chaine_lue_ += boomeramg(st_); // Best CPU solver + return create_boomeramg(); // Best CPU solver #endif chaine_lue_ += rtol_>0 ? Nom(rtol_, " rtol %e") : Nom(atol_, " atol %e"); if (impr_) chaine_lue_ += " impr"; @@ -235,49 +295,74 @@ void Solv_AMG::create_amg() chaine_lue_ += " }"; } +void Solv_AMG::create_solver() +{ + create_amg(); + if (nb_blocks_>1) + { + // Block matrix : we use PCFieldsplit (eg: VEF) for preconditioner + // Much better convergence for P0P1 for instance + Cerr << "Detecting " << nb_blocks_ << "x" << nb_blocks_ << " blocks into the matrix. 
Creating a specific block preconditioning:" << finl; + if (chaine_lue_.contient("gamg")) + create_block_amg(nb_blocks_, "gamg"); + else if (chaine_lue_.contient("boomeramg")) + create_block_amg(nb_blocks_, "boomeramg"); + else if (library_=="amgx") + { + library_ = "petsc_gpu"; + create_block_amg(nb_blocks_, "amgx"); + } + } + Cerr << "====================================================================" << finl; + Cerr << "[AMG] Creating solver: " << library_ << " " << chaine_lue_ << finl; + Cerr << "====================================================================" << finl; + EChaine entree(chaine_lue_); + Nom nom_solveur("Solv_"); + nom_solveur+=library_; + solveur_.typer(nom_solveur); + solveur_.nommer("solveur_pression"); + if (library_=="amgx") + ref_cast(Solv_AMGX, solveur_.valeur()).create_solver(entree); + else if (library_=="petsc") + ref_cast(Solv_Petsc, solveur_.valeur()).create_solver(entree); + else if (library_=="petsc_gpu") + ref_cast(Solv_Petsc_GPU, solveur_.valeur()).create_solver(entree); + else + Process::exit("Unsupported case in Solv_AMG::readOn"); + solveur_->set_save_matrix(save_matrix()); + solveur_->set_read_matrix(read_matrix()); +} + int Solv_AMG::resoudre_systeme(const Matrice_Base& mat, const DoubleVect& b, DoubleVect& x) { // We don't create solver during readOn as usual but just before solve to get more infos about matrix/vectors to fine tune if (!solveur_) { - create_amg(); - int nb_blocks = sub_type(MD_Vector_composite, b.get_md_vector().valeur()) ? ref_cast(MD_Vector_composite, b.get_md_vector().valeur()).nb_parts() : 1; - if (nb_blocks>1) - { - // Block matrix : we use PCFieldsplit (eg: VEF) for preconditioner - // Much better convergence for P0P1 for instance - Cerr << "Detecting " << nb_blocks << "x" << nb_blocks << " blocks into the matrix. 
Creating a specific block preconditioning:" << finl; - if (chaine_lue_.contient("gamg")) - create_block_amg(nb_blocks, "gamg"); - else if (chaine_lue_.contient("boomeramg")) - create_block_amg(nb_blocks, "boomeramg"); - else if (library_=="amgx") - { - library_ = "petsc_gpu"; - create_block_amg(nb_blocks, "amgx"); - } - } - Cerr << "====================================================================" << finl; - Cerr << "Creating AMG solver: " << library_ << " " << chaine_lue_ << finl; - Cerr << "====================================================================" << finl; - EChaine entree(chaine_lue_); - Nom nom_solveur("Solv_"); - nom_solveur+=library_; - solveur_.typer(nom_solveur); - solveur_.nommer("solveur_pression"); - if (library_=="amgx") - ref_cast(Solv_AMGX, solveur_.valeur()).create_solver(entree); - else if (library_=="petsc") - ref_cast(Solv_Petsc, solveur_.valeur()).create_solver(entree); - else if (library_=="petsc_gpu") - ref_cast(Solv_Petsc_GPU, solveur_.valeur()).create_solver(entree); - else - Process::exit("Unsupported case in Solv_AMG::readOn"); - solveur_->set_save_matrix(save_matrix()); - solveur_->set_read_matrix(read_matrix()); + // Seen on Cuda, multi-node MPI-Cuda Aware but also on Lumi (amg unsymmetric preconditioner, cg may diverge) + // KSP divergence with cg+amg so we switch to gmres+amg (bcgs slower) + if (Process::nproc()>4) petsc_cg_issue_ = true; + nb_blocks_ = sub_type(MD_Vector_composite, b.get_md_vector().valeur()) ? ref_cast(MD_Vector_composite, b.get_md_vector().valeur()).nb_parts() : 1; + create_solver(); + Cerr << "[AMG] If you experience OOM during setup, try to increase the strong threshold (st keyword): AMG XXX { rtol XXX impr st XXX }" << finl; } statistics().end_count(STD_COUNTERS::system_solver,-1,0); - int res = solveur_.resoudre_systeme(mat, b, x); + int nb_iter=0; + try + { + nb_iter = solveur_.resoudre_systeme(mat, b, x); + } + catch(...) 
+ { + statistics().end_count(STD_COUNTERS::system_solver,1,nb_iter); + petsc_cg_issue_ = true; + create_solver(); + nb_iter = solveur_.resoudre_systeme(mat, b, x); + } statistics().begin_count(STD_COUNTERS::system_solver,statistics().get_last_opened_counter_level()+1); - return res; + if (rtol_<0) + { + Cout << "Warning: you define only atol (absolute tolerance, dimensional value) for the AMG solver." << finl; + Cout << "Strongly recomended to rather define rtol (relative tolerance) as the first convergence criteria and atol as a second criteria." << finl; + } + return nb_iter; } diff --git a/src/Kernel/Math/SolvSys/Solv_AMG.h b/src/Kernel/Math/SolvSys/Solv_AMG.h index 831e5a2ec3..443f21c81f 100644 --- a/src/Kernel/Math/SolvSys/Solv_AMG.h +++ b/src/Kernel/Math/SolvSys/Solv_AMG.h @@ -40,11 +40,16 @@ public : private : void create_amg(); void create_block_amg(int,Nom); + void create_gamg(); + void create_boomeramg(); + void create_solver(); SolveurSys solveur_; Nom library_="", solver_="", options_=""; double rtol_=-1, atol_=-1, st_=-1; bool impr_ = false; bool petsc_cg_issue_ = false; + int nb_blocks_ = 1; + std::string ksp_type_ = ""; }; #endif diff --git a/src/Kernel/Math/SolvSys/Solv_Externe.cpp b/src/Kernel/Math/SolvSys/Solv_Externe.cpp index e5255625c5..3ffe07daeb 100644 --- a/src/Kernel/Math/SolvSys/Solv_Externe.cpp +++ b/src/Kernel/Math/SolvSys/Solv_Externe.cpp @@ -36,14 +36,7 @@ Entree& Solv_Externe::readOn(Entree& is) void Solv_Externe::MorseSymToMorse(const Matrice_Morse_Sym& MS, Matrice_Morse& M) { - M = MS; - Matrice_Morse mattmp(MS); - M.transpose(mattmp); - int ordre = M.ordre(); - for (int i=0; i void Update_lhs_rhs(const DoubleVect& b, DoubleVect& x); template diff --git a/src/Kernel/Math/SolvSys/Solv_Gmres.h b/src/Kernel/Math/SolvSys/Solv_Gmres.h index c78ef1143e..a7e256b458 100644 --- a/src/Kernel/Math/SolvSys/Solv_Gmres.h +++ b/src/Kernel/Math/SolvSys/Solv_Gmres.h @@ -42,7 +42,6 @@ protected : int lire_motcle_non_standard(const Motcle&, Entree&) 
override; int Gmres(const Matrice_Morse&, const DoubleVect&, DoubleVect& ); - int gmres_local( const Matrice_Morse& A, const DoubleVect& b, DoubleVect& tab_x1); DoubleVects v; //espcace Krilov bool is_local_gmres = false; @@ -52,6 +51,8 @@ protected : DoubleTab h; DoubleVect r; DoubleVect h_loc, dh_loc; + protected_but_public_for_cuda + int gmres_local( const Matrice_Morse& A, const DoubleVect& b, DoubleVect& tab_x1); }; #endif /* Solv_Gmres_included */ diff --git a/src/Kernel/Math/SolvSys/Solv_Petsc.cpp b/src/Kernel/Math/SolvSys/Solv_Petsc.cpp index 84c97a67d7..0dd11a0c20 100644 --- a/src/Kernel/Math/SolvSys/Solv_Petsc.cpp +++ b/src/Kernel/Math/SolvSys/Solv_Petsc.cpp @@ -1315,6 +1315,9 @@ void Solv_Petsc::create_solver(Entree& entree) PCSetType(PreconditionneurPetsc_, PCHYPRE); PCHYPRESetType(PreconditionneurPetsc_, "boomeramg"); // Classical C-AMG pc_supported_on_gpu_by_petsc=1; +#ifdef TRUST_USE_CUDA + add_option("pc_mg_galerkin_mat_product_algorithm", "hypre"); // AVoid OOM on device on CUDA +#endif // Changement pc_hypre_boomeramg_relax_type_all pour PETSc 3.10, la matrice de // preconditionnement etant seqaij, symetric-SOR/jacobi (defaut) provoque KSP_DIVERGED_INDEFINITE_PC // Voir: https://lists.mcs.anl.gov/mailman/htdig/petsc-users/2012-December/015922.html @@ -2042,7 +2045,7 @@ int Solv_Petsc::resoudre_systeme(const Matrice_Base& la_matrice, const DoubleVec if (dm_!=nullptr) DMDestroy(&dm_); } - + has_constant_nullspace_ = la_matrice.has_constant_nullspace(); matrice_symetrique_ = true; // On suppose que la matrice est symetrique // Construction de la numerotation globale: @@ -2285,16 +2288,13 @@ int Solv_Petsc::solve(ArrOfDouble& residu) Cerr << "KSP_DIVERGED_ITS" << finl; Cerr << "That means the solver didn't converge within the maximal iterations number." << finl; Cerr << "You can change the maximal number of iterations with the -ksp_max_it option." 
<< finl; -#ifdef MPIX_CUDA_AWARE_SUPPORT - // Probleme vu avec GPU direct si >= 4 GPUs et preconditinneurs C-AMG ou BOOMERAMG - // OK pour SA-AMG et Jacobi - // Il faudrait faire un reproducer a soumettre a PETSc... - Cerr << "It seems there is a convergence issue (bug?) with MPI GPU Aware library with PETSc CG and some preconditioners." << finl; - Cerr << "Try using BICGSTAB instead of GCP to bypass the issue." << finl; - Process::exit(); -#endif } else Cerr << (int)Reason << finl; + if ((Reason==KSP_DIVERGED_INDEFINITE_PC || Reason==KSP_DIVERGED_INDEFINITE_MAT) && type_ksp_ == "cg") + { + Cerr << "It seems you are using GCP but with an unsymmetric preconditioning." << finl; + Cerr << "Try using GMRES or BICCGTAB to bypass this issue of non convergence." << finl; + } throw Reason; } if (Reason<0 && !return_on_error_) exit(); @@ -3272,7 +3272,6 @@ void Solv_Petsc::Update_matrix(Mat& MatricePetsc, const Matrice_Morse& mat_morse // On dimensionne ces tableaux a la taille la plus grande possible // ToDo : recalcul de nnz utile ? ArrOfInt nnz(nb_rows_); - nnz = 0; ArrOfTID& renum_array = renum_; // tab seen as a flat array (can't use ArrOfPetscInt& because of C++ ref cast...) const auto& tab1 = mat_morse.get_tab1(); const auto& tab2 = mat_morse.get_tab2(); @@ -3338,6 +3337,26 @@ void Solv_Petsc::Update_matrix(Mat& MatricePetsc, const Matrice_Morse& mat_morse MatAssemblyEnd(MatricePetsc, MAT_FINAL_ASSEMBLY); } + /* attach null space if any */ + if (has_constant_nullspace_) + { + MatNullSpace nullsp; + MatNullSpaceCreate(PETSC_COMM_WORLD, PETSC_TRUE, 0, PETSC_NULLPTR, &nullsp); + MatSetNullSpace(MatricePetsc, nullsp); + MatSetNearNullSpace(MatricePetsc, nullsp); // useful for AMG + PetscBool isNull; + MatNullSpaceTest(nullsp,MatricePetsc,&isNull); + if (!isNull) + { + Cerr << "[Petsc] Warning, matrix has not null space as specified! We remove it..." 
<< finl; + MatSetNullSpace(MatricePetsc, PETSC_NULLPTR); + MatSetNearNullSpace(MatricePetsc, PETSC_NULLPTR); + } + else + Cerr << "[Petsc] Creating null space on the matrix." << finl; + MatNullSpaceDestroy(&nullsp); + } + if (!nouveau_stencil_ && reorder_matrix_) { Mat Aperm; diff --git a/src/Kernel/Math/SolvSys/Solv_Petsc.h b/src/Kernel/Math/SolvSys/Solv_Petsc.h index e1a8048def..76fa80082e 100644 --- a/src/Kernel/Math/SolvSys/Solv_Petsc.h +++ b/src/Kernel/Math/SolvSys/Solv_Petsc.h @@ -105,7 +105,7 @@ public : { return amgx_initialized_; }; -#if PETSC_VERSION_GE(3,24,0) +#if PETSC_VERSION_GE(3,24,0) && PETSC_VERSION_LT(3,25,0) PetscErrorCode set_convergence_test(PetscErrorCode (*converge)(KSP,PetscInt,PetscReal,KSPConvergedReason*,void*),void *cctx,PetscErrorCode (*destroy)(void**)) #else PetscErrorCode set_convergence_test(PetscErrorCode (*converge)(KSP,PetscInt,PetscReal,KSPConvergedReason*,void*),void *cctx,PetscErrorCode (*destroy)(void*)) @@ -124,7 +124,7 @@ public : } #endif - public_for_cuda + protected_but_public_for_cuda #ifdef PETSCKSP_H virtual void Update_matrix(Mat& MatricePetsc, const Matrice_Morse& mat_morse); // Fill the (previously allocated) PETSc matrix with mat_morse coefficients #endif @@ -197,7 +197,7 @@ protected : VecScatter VecScatter_; // Scatter context needed when petsc_decide_=1 to gather values of global to local solution #endif - + bool has_constant_nullspace_ = false; // To enable Null Space treatment int solveur_direct_ = no; // Pour savoir si l'on manipule un solveur direct et non iteratif bool gpu_ = false; // Utilisation des solveurs GPU de PETSc bool amgx_ = false; // Utilisation des solveurs GPU de AMGX diff --git a/src/Kernel/Math/TRUSTTab.tpp b/src/Kernel/Math/TRUSTTab.tpp index ae67efae1c..ae6c61bd6b 100644 --- a/src/Kernel/Math/TRUSTTab.tpp +++ b/src/Kernel/Math/TRUSTTab.tpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -17,6 +17,7 @@ #define TRUSTTab_TPP_included #include +#include // TODO : FIXME : delete template @@ -675,8 +676,13 @@ inline void TRUSTTab<_TYPE_,_SIZE_>::set_md_vector(const MD_Vector& md_vector) #ifndef LATATOOLS _SIZE_ dim0 = dimension_tot_0_; if (md_vector.non_nul()) - // renvoie -1 si l'appel est invalide ou si le MD_Vector est mix (cf doc MD_Vector_base): - dim0 = md_vector->get_nb_items_reels(); + { + if (sub_type(MD_Vector_seq, md_vector.valeur())) + dim0 = (_SIZE_)md_vector->nb_items_seq_tot(); + else + // renvoie -1 si l'appel est invalide ou si le MD_Vector est mix (cf doc MD_Vector_base): + dim0 = md_vector->get_nb_items_reels(); + } dimensions_[0] = dim0; assert(verifie_LINE_SIZE()); // a appeler meme pour un md_vector nul (pour remettre size_reelle_): @@ -759,9 +765,6 @@ template template inline void TRUSTTab<_TYPE_,_SIZE_>::ajoute_produit_tensoriel(_T_ alpha, const TRUSTTab<_T_,_SIZE_>& x, const TRUSTTab<_T_,_SIZE_>& y) { - this->ensureDataOnHost(); - x.ensureDataOnHost(); - y.ensureDataOnHost(); // Tableaux vus comme des tableaux unidimensionnels (pour ne pas avoir a gerer nb_dim) const TRUSTVect<_T_,_SIZE_>& vx = x, &vy = y; TRUSTVect<_T_,_SIZE_>& v = *this; @@ -790,23 +793,9 @@ inline void TRUSTTab<_TYPE_,_SIZE_>::ajoute_produit_tensoriel(_T_ alpha, const T bloc_itr = Block_Iter<_SIZE_>(0, v.size_totale() / v.line_size()); // iterator on a single (big) block } } + if (nblocs_left == 0) return; - for (; nblocs_left; nblocs_left--) - { - const _SIZE_ debut = (*(bloc_itr++)), fin = (*(bloc_itr++)); - _SIZE_ v_index = debut * line_size_xy; - for (_SIZE_ i = debut; i < fin; i++) - for (_SIZE_ j = 0; j < line_size_x; j++) - { - _T_ xval = vx[i * line_size_x + j]; - for (_SIZE_ k = 0; k < line_size_y; k++) - { - _T_ yval = vy[i * line_size_y + k]; - v[v_index] += alpha * xval * yval; - 
v_index++; - } - } - } + ::ajoute_produit_tensoriel(alpha, v, vx, vy, nblocs_left, bloc_itr, line_size_x, line_size_y, line_size_xy); } // Resolution du systeme Ax=b diff --git a/src/Kernel/Math/TRUSTTab_tools.cpp b/src/Kernel/Math/TRUSTTab_tools.cpp index 04063838b9..a385ef92d4 100644 --- a/src/Kernel/Math/TRUSTTab_tools.cpp +++ b/src/Kernel/Math/TRUSTTab_tools.cpp @@ -141,3 +141,29 @@ template void local_carre_norme_tab(const TRUSTTab& tableau, template void local_carre_norme_tab(const TRUSTTab& tableau, TRUSTArray& norme_colonne); template void local_max_abs_tab(const TRUSTTab& tableau, TRUSTArray& max_colonne); template void local_max_abs_tab(const TRUSTTab& tableau, TRUSTArray& max_colonne); + +template +void ajoute_produit_tensoriel(_T_ alpha, TRUSTVect<_T_,_SIZE_>& tab_v, const TRUSTVect<_T_,_SIZE_>& tab_x, const TRUSTVect<_T_,_SIZE_>& tab_y, + int nblocs_left, Block_Iter<_SIZE_> bloc_itr, + int line_size_x, int line_size_y, int line_size_xy) +{ + auto x = tab_x.template view_ro<1>().data(); + auto y = tab_y.template view_ro<1>().data(); + auto v = tab_v.template view_rw<1>().data(); +#ifdef TRUST_USE_GPU + if (nblocs_left > 3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as ajoute_produit_tensoriel_kernel"); +#endif + for (; nblocs_left; nblocs_left--) + { + const _SIZE_ debut = (*(bloc_itr++)), fin = (*(bloc_itr++)); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_3D({debut, 0, 0}, {fin, line_size_x, line_size_y}), + KOKKOS_LAMBDA(const int i, const int j, const int k) + { + v[i * line_size_xy + j * line_size_y + k] += alpha * x[i * line_size_x + j] * y[i * line_size_y + k]; + }); + end_gpu_timer(__KERNEL_NAME__); + } +} + +template void ajoute_produit_tensoriel(double alpha, TRUSTVect& v, const TRUSTVect& vx, const TRUSTVect& vy, int nblocs_left, Block_Iter bloc_itr, int line_size_x, int line_size_y, int line_size_xy); +template void ajoute_produit_tensoriel(float alpha, TRUSTVect& v, const TRUSTVect& vx, const TRUSTVect& vy, 
int nblocs_left, Block_Iter bloc_itr, int line_size_x, int line_size_y, int line_size_xy); diff --git a/src/Kernel/Math/TRUSTTab_tools.tpp b/src/Kernel/Math/TRUSTTab_tools.tpp index 84e6974d06..8f18f5cce5 100644 --- a/src/Kernel/Math/TRUSTTab_tools.tpp +++ b/src/Kernel/Math/TRUSTTab_tools.tpp @@ -64,6 +64,11 @@ inline void mp_max_abs_tab(const TRUSTTab<_T_,int>& tableau, TRUSTArray<_T_,int> Process::mp_max_for_each_item(max_colonne); } +template +extern void ajoute_produit_tensoriel(_T_ alpha, TRUSTVect<_T_,_SIZE_>& v, const TRUSTVect<_T_,_SIZE_>& vx, const TRUSTVect<_T_,_SIZE_>& vy, + int nblocs_left, Block_Iter<_SIZE_> bloc_itr, + int line_size_x, int line_size_y, int line_size_xy); + #ifndef LATATOOLS /** * @brief Compares two `TRUSTTab` objects for equality. diff --git a/src/Kernel/Math/TRUSTVect_tools.cpp b/src/Kernel/Math/TRUSTVect_tools.cpp index 1a06a09268..7ecb662cd9 100644 --- a/src/Kernel/Math/TRUSTVect_tools.cpp +++ b/src/Kernel/Math/TRUSTVect_tools.cpp @@ -142,43 +142,30 @@ template void ajoute_produit_scalaire(TRUSTVect& resu, f namespace { template -void operation_speciale_tres_generic_kernel(TRUSTVect<_TYPE_, _SIZE_>& resu, const TRUSTVect<_TYPE_, _SIZE_>& vx, int nblocs_left, +void operation_speciale_tres_generic_kernel(TRUSTVect<_TYPE_, _SIZE_>& tab_resu, const TRUSTVect<_TYPE_, _SIZE_>& tab_vx, int nblocs_left, Block_Iter<_SIZE_>& bloc_itr, const int line_size_vx, const _SIZE_ vect_size_tot, const int delta_line_size) { - auto vx_view= vx.template view_ro<1, ExecSpace>().data(); - auto resu_view= resu.template view_rw<1, ExecSpace>().data(); -#ifdef TRUST_USE_GPU - if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel"); -#endif - for (; nblocs_left; nblocs_left--) - { - // Get index of next bloc start: - const int begin_bloc = (*(bloc_itr++)) * line_size_vx; - const int end_bloc = (*(bloc_itr++)) * line_size_vx; - - assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc 
>= begin_bloc); - - // Adjust pointers to indices - const int resu_start_idx = begin_bloc * delta_line_size; - - Kokkos::RangePolicy policy(begin_bloc, end_bloc); - if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const int i) + auto vx = tab_vx.template view_ro<1, ExecSpace>().data(); + auto resu = tab_resu.template view_rw<1, ExecSpace>().data(); + const bool has_items = static_cast(bloc_itr.items_); + auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr; + const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size_vx; + if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); + Kokkos::parallel_for(Kokkos::RangePolicy(0, n_items), KOKKOS_LAMBDA(const _SIZE_ i) + { + for (int jv = 0; jv < line_size_vx; jv++) { - const _TYPE_ x = vx_view[i]; - - //The // for could be also placed there + const _SIZE_ vx_i = (has_items ? items[i] : i) * line_size_vx + jv; + const _TYPE_ x = vx[vx_i]; for (int j = 0; j < delta_line_size; ++j) { - const int resu_idx = resu_start_idx + i * delta_line_size + j; - if (IS_MUL) - resu_view[resu_idx] *= x; - else //If it's not MUL, it's DIV - resu_view[resu_idx] *= ((_TYPE_)1 / x); + const _SIZE_ resu_idx = vx_i * delta_line_size + j; + if (IS_MUL) resu[resu_idx] *= x; + else resu[resu_idx] *= ((_TYPE_)1 / x); } - }); - if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); - } + } + }); + if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); } } #endif @@ -192,17 +179,17 @@ void operation_speciale_tres_generic(TRUSTVect<_TYPE_, _SIZE_>& resu, const TRUS static constexpr bool IS_MUL = (_TYPE_OP_ == TYPE_OPERATION_VECT_SPEC_GENERIC::MUL_); //it's either MUL or DIV // get info for computation - const int line_size = resu.line_size(), line_size_vx = vx.line_size(), vect_size_tot = resu.size_totale(); + const int line_size = 
resu.line_size(), line_size_vx = vx.line_size(); const MD_Vector& md = resu.get_md_vector(); // Le line_size du vecteur resu doit etre un multiple du line_size du vecteur vx assert(line_size > 0 && line_size_vx > 0 && line_size % line_size_vx == 0); const int delta_line_size = line_size / line_size_vx; - assert(vx.size_totale() * delta_line_size == vect_size_tot); // this test is necessary if md is null + assert(vx.size_totale() * delta_line_size == resu.size_totale()); // this test is necessary if md is null assert(vx.get_md_vector() == md); - // Determine blocs of data to process, depending on " opt" + // Determine blocs of data to process using vx dimensions so items_ holds flat vx indices int nblocs_left; - Block_Iter<_SIZE_> bloc_itr = ::determine_blocks(opt, md, vect_size_tot, line_size, nblocs_left); + Block_Iter<_SIZE_> bloc_itr = ::determine_blocks(opt, md, vx.size_totale(), line_size_vx, nblocs_left); // Shortcut for empty arrays (avoid case line_size == 0) if (bloc_itr.empty()) return; @@ -211,9 +198,9 @@ void operation_speciale_tres_generic(TRUSTVect<_TYPE_, _SIZE_>& resu, const TRUS //Lauch computation with the execution space and view types as (template) parameters if (kernelOnDevice) - operation_speciale_tres_generic_kernel(resu, vx, nblocs_left, bloc_itr, line_size_vx, vect_size_tot, delta_line_size); + operation_speciale_tres_generic_kernel(resu, vx, nblocs_left, bloc_itr, line_size_vx, vx.size_totale(), delta_line_size); else - operation_speciale_tres_generic_kernel(resu, vx, nblocs_left, bloc_itr, line_size_vx, vect_size_tot, delta_line_size); + operation_speciale_tres_generic_kernel(resu, vx, nblocs_left, bloc_itr, line_size_vx, vx.size_totale(), delta_line_size); #ifndef NDEBUG // In debug mode, put invalid values where data has not been computed @@ -237,35 +224,26 @@ template void operation_speciale_tres_generic -void operation_speciale_generic_kernel(TRUSTVect<_TYPE_, _SIZE_>& resu, const TRUSTVect<_TYPE_, _SIZE_>& vx, _TYPE_ alpha, int 
nblocs_left, +void operation_speciale_generic_kernel(TRUSTVect<_TYPE_, _SIZE_>& tab_resu, const TRUSTVect<_TYPE_, _SIZE_>& tab_vx, _TYPE_ alpha, int nblocs_left, Block_Iter<_SIZE_>& bloc_itr, const _SIZE_ vect_size_tot, const int line_size) { - auto vx_view= vx.template view_ro<1, ExecSpace>().data(); - auto resu_view= resu.template view_rw<1, ExecSpace>().data(); -#ifdef TRUST_USE_GPU - if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel"); -#endif - for (; nblocs_left; nblocs_left--) - { - // Get index of next bloc start: - const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size; - const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size; - - assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc); - - Kokkos::RangePolicy policy(begin_bloc, end_bloc); - if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const int i) + auto vx = tab_vx.template view_ro<1, ExecSpace>().data(); + auto resu = tab_resu.template view_rw<1, ExecSpace>().data(); + const bool has_items = static_cast(bloc_itr.items_); + auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr; + const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size; + if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); + Kokkos::parallel_for(Kokkos::RangePolicy(0, n_items), KOKKOS_LAMBDA(const _SIZE_ i) + { + for (int j = 0; j < line_size; j++) { - const _TYPE_ x = vx_view[i]; - - if (IS_ADD) //done at compile time - resu_view[i] += alpha * x; - else //If it's not ADD, it's SQUARE - resu_view[i] += alpha * x * x; - }); - if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); - } + const _SIZE_ item = (has_items ? 
items[i] : i) * line_size + j; + const _TYPE_ x = vx[item]; + if (IS_ADD) resu[item] += alpha * x; + else resu[item] += alpha * x * x; + } + }); + if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); } } #endif @@ -316,7 +294,7 @@ template void ajoute_operation_speciale_generic -void operator_vect_vect_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, const TRUSTVect<_TYPE_, _SIZE_>& vx, int nblocs_left, +void operator_vect_vect_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& tab_resu, const TRUSTVect<_TYPE_, _SIZE_>& tab_vx, int nblocs_left, Block_Iter<_SIZE_>& bloc_itr, const _SIZE_ vect_size_tot, const int line_size) { static constexpr bool IS_ADD = (_TYPE_OP_ == TYPE_OPERATOR_VECT::ADD_), IS_SUB = (_TYPE_OP_ == TYPE_OPERATOR_VECT::SUB_), @@ -324,35 +302,30 @@ void operator_vect_vect_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, const TRU IS_EGAL = (_TYPE_OP_ == TYPE_OPERATOR_VECT::EGAL_); #ifdef TRUST_USE_GPU - auto vx_view= vx.template view_ro<1, ExecSpace>().data(); - auto resu_view= resu.template view_rw<1, ExecSpace>().data(); -#ifdef TRUST_USE_GPU - if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel"); -#endif - for (; nblocs_left; nblocs_left--) - { - // Get index of next bloc start: - const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size; - const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size; - - assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc); - Kokkos::RangePolicy policy(begin_bloc, end_bloc); - if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const _SIZE_ i) + auto vx = tab_vx.template view_ro<1, ExecSpace>().data(); + auto resu = tab_resu.template view_rw<1, ExecSpace>().data(); + const bool has_items = static_cast(bloc_itr.items_); + auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr; + const _SIZE_ n_items = has_items ? 
bloc_itr.items_->size_array() : vect_size_tot / line_size; + if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); + Kokkos::parallel_for(Kokkos::RangePolicy(0, n_items), KOKKOS_LAMBDA(const _SIZE_ i) + { + for (int j = 0; j < line_size; j++) { - const _TYPE_ x = vx_view[i]; - if (IS_ADD) resu_view[i] += x; - if (IS_SUB) resu_view[i] -= x; - if (IS_MULT) resu_view[i] *= x; - if (IS_DIV) resu_view[i] /= x; - if (IS_EGAL) resu_view[i] = x; - }); - if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); - } + const _SIZE_ item = (has_items ? items[i] : i) * line_size + j; + const _TYPE_ x = vx[item]; + if (IS_ADD) resu[item] += x; + if (IS_SUB) resu[item] -= x; + if (IS_MULT) resu[item] *= x; + if (IS_DIV) resu[item] /= x; + if (IS_EGAL) resu[item] = x; + } + }); + if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); #else // Need to keep C++ optimized (pointer) implementation for PolyMAC_CDO in Flica5 - _TYPE_ *resu_base = resu.data(); - const _TYPE_ *x_base = vx.data(); + _TYPE_ *resu_base = tab_resu.data(); + const _TYPE_ *x_base = tab_vx.data(); for (; nblocs_left; nblocs_left--) { // Get index of next bloc start: @@ -429,7 +402,7 @@ template void operator_vect_vect_generic( namespace { template -void operator_vect_single_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, const _TYPE_ x, int nblocs_left, +void operator_vect_single_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& tab_resu, const _TYPE_ x, int nblocs_left, Block_Iter<_SIZE_>& bloc_itr, const _SIZE_ vect_size_tot, const int line_size) { static constexpr bool IS_ADD = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::ADD_), IS_SUB = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::SUB_), @@ -437,34 +410,29 @@ void operator_vect_single_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, const _ IS_NEGATE = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::NEGATE_), IS_INV = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::INV_), IS_ABS = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::ABS_), IS_SQRT = 
(_TYPE_OP_ == TYPE_OPERATOR_SINGLE::SQRT_), IS_SQUARE = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::SQUARE_); - auto resu_view= resu.template view_rw<1, ExecSpace>().data(); -#ifdef TRUST_USE_GPU - if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel"); -#endif - for (; nblocs_left; nblocs_left--) - { - // Get index of next bloc start: - const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size; - const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size; - - assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc); - Kokkos::RangePolicy policy(begin_bloc, end_bloc); - if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const _SIZE_ i) + auto resu = tab_resu.template view_rw<1, ExecSpace>().data(); + const bool has_items = static_cast(bloc_itr.items_); + auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr; + const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size; + if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); + Kokkos::parallel_for(Kokkos::RangePolicy(0, n_items), KOKKOS_LAMBDA(const _SIZE_ i) + { + for (int j = 0; j < line_size; j++) { - if (IS_SUB) resu_view[i] -= x; - if (IS_ADD) resu_view[i] += x; - if (IS_MULT) resu_view[i] *= x; - if (IS_EGAL) resu_view[i] = x; - if (IS_NEGATE) resu_view[i] = -resu_view[i]; - if (IS_ABS) resu_view[i] = (_TYPE_) Kokkos::abs(resu_view[i]); - if (IS_SQRT) resu_view[i] = (_TYPE_) Kokkos::sqrt(resu_view[i]); - if (IS_SQUARE) resu_view[i] = resu_view[i]*resu_view[i]; - if (IS_DIV) resu_view[i] /= x; - if (IS_INV) resu_view[i] = (_TYPE_) ((_TYPE_)1 /resu_view[i]); - }); - if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); - } + const _SIZE_ item = (has_items ? 
items[i] : i) * line_size + j; + if (IS_SUB) resu[item] -= x; + if (IS_ADD) resu[item] += x; + if (IS_MULT) resu[item] *= x; + if (IS_EGAL) resu[item] = x; + if (IS_NEGATE) resu[item] = -resu[item]; + if (IS_ABS) resu[item] = (_TYPE_) Kokkos::abs(resu[item]); + if (IS_SQRT) resu[item] = (_TYPE_) Kokkos::sqrt(resu[item]); + if (IS_SQUARE) resu[item] = resu[item]*resu[item]; + if (IS_DIV) resu[item] /= x; + if (IS_INV) resu[item] = (_TYPE_) ((_TYPE_)1 /resu[item]); + } + }); + if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); } } #endif @@ -556,7 +524,7 @@ namespace { template void local_extrema_vect_generic_kernel(const TRUSTVect<_TYPE_,_SIZE_>& vx, int nblocs_left, Block_Iter<_SIZE_>& bloc_itr, - const _SIZE_ vect_size_tot, const int line_size, _TYPE_& min_max_val, int& i_min_max) + const _SIZE_ vect_size_tot, const int line_size, _TYPE_& min_max_val, _SIZE_& i_min_max) { // Shortcut for empty arrays (avoid case line_size == 0) if (bloc_itr.empty()) return ; @@ -570,53 +538,40 @@ void local_extrema_vect_generic_kernel(const TRUSTVect<_TYPE_,_SIZE_>& vx, int n static constexpr bool IS_ABS = (IS_MAX_ABS || IS_MIN_ABS); // Define the reducer, based on the reduction type - using reducer = typename std::conditional, Kokkos::MinLoc<_TYPE_, int>>::type; + using reducer = typename std::conditional, Kokkos::MinLoc<_TYPE_, _SIZE_>>::type; // Define the type of what the reducer will return ( a value + a index) using reducer_value_type = typename reducer::value_type; if (not(IS_MAXS || IS_MINS)) {Process::exit("Wrong operation type in local_extrema_vect_generic_kernel");} auto vx_view= vx.template view_ro<1, ExecSpace>().data(); -#ifdef TRUST_USE_GPU - if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel"); -#endif - for (; nblocs_left; nblocs_left--) - { - // Get index of next bloc start: - const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size; - const _SIZE_ end_bloc = 
(*(bloc_itr++)) * line_size; - - //Asserts - assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc); + const bool has_items = static_cast(bloc_itr.items_); + auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr; + const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size; - //Define Policy - Kokkos::RangePolicy policy(begin_bloc, end_bloc); - - // Define the object in which the reduction is saved - reducer_value_type bloc_min_max; - - //Reduction - if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); - Kokkos::parallel_reduce(policy, - KOKKOS_LAMBDA(const int i, reducer_value_type& local_min_max) + reducer_value_type global_min_max; + if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n_items), + KOKKOS_LAMBDA(const _SIZE_ i, reducer_value_type& local_min_max) + { + for (int j = 0; j < line_size; j++) { - const _TYPE_ val = (IS_ABS) ? Kokkos::abs(vx_view[i]) : vx_view[i]; - + const _SIZE_ item = (has_items ? items[i] : i) * line_size + j; + const _TYPE_ val = (IS_ABS) ? Kokkos::abs(vx_view[item]) : vx_view[item]; if ( (IS_MAXS && val>local_min_max.val) || (IS_MINS && val); + }, + reducer(global_min_max)); + if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); - //Bloc-level reduction - if ( (IS_MAXS && bloc_min_max.val > min_max_val) || (IS_MINS && bloc_min_max.val < min_max_val) ) - { - min_max_val=bloc_min_max.val; - i_min_max= bloc_min_max.loc; - } + if ( (IS_MAXS && global_min_max.val > min_max_val) || (IS_MINS && global_min_max.val < min_max_val) ) + { + min_max_val = global_min_max.val; + i_min_max = global_min_max.loc; } } } @@ -643,7 +598,7 @@ _TYPE_RETURN_ local_extrema_vect_generic(const TRUSTVect<_TYPE_,_SIZE_>& vx, Mp_ //Initialize results _TYPE_ min_max_val = neutral_value<_TYPE_,_TYPE_OP_>(); // _TYPE_ et pas _TYPE_RETURN_ desole ... 
- int i_min_max = -1 ; // seulement pour IMAX_ et IMIN_ + _SIZE_ i_min_max = -1 ; // seulement pour IMAX_ et IMIN_ //Localize data bool kernelOnDevice = vx.checkDataOnDevice(); @@ -695,60 +650,30 @@ template trustIdType local_extrema_vect_generic -void local_operations_vect_bis_generic_kernel(const TRUSTVect<_TYPE_,_SIZE_>& vx, int nblocs_left, +void local_operations_vect_bis_generic_kernel(const TRUSTVect<_TYPE_,_SIZE_>& tab_vx, int nblocs_left, Block_Iter<_SIZE_>& bloc_itr, const _SIZE_ vect_size_tot, const int line_size, _TYPE_& sum) { static constexpr bool IS_SQUARE = (_TYPE_OP_ == TYPE_OPERATION_VECT_BIS::SQUARE_), IS_SUM = (_TYPE_OP_ == TYPE_OPERATION_VECT_BIS::SOMME_); // Performance important point for TRUSTArray dynamic kernel to have serial mode performance: // Use pointer access into Kokkos loop with [] and getting raw pointer to view with .data() ! - auto vx_view = vx.template view_ro<1, ExecSpace>().data(); - if (nblocs_left>3) - { - // We use flattened items_blocs cause possible huge number in parallel of nblocs_left/kernel launch (e.g. during moyenne(Ps)) - auto items = bloc_itr.items_->template view_ro<1, ExecSpace>().data(); - // Reduction - if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); - Kokkos::parallel_reduce(__KERNEL_NAME__, - Kokkos::RangePolicy(0, bloc_itr.items_->size_array()), - KOKKOS_LAMBDA(const int i, _TYPE_& local_sum) + auto vx = tab_vx.template view_ro<1, ExecSpace>().data(); + const bool has_items = static_cast(bloc_itr.items_); + auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr; + const _SIZE_ n_items = has_items ? 
bloc_itr.items_->size_array() : vect_size_tot / line_size; + if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); + Kokkos::parallel_reduce(__KERNEL_NAME__, + Kokkos::RangePolicy(0, n_items), + KOKKOS_LAMBDA(const _SIZE_ i, _TYPE_& local_sum) + { + for (int j = 0; j < line_size; j++) { - _SIZE_ item = items[i] * line_size; - const _TYPE_ x = vx_view[item]; + const _SIZE_ item = (has_items ? items[i] : i) * line_size + j; + const _TYPE_ x = vx[item]; if (IS_SQUARE) local_sum += x * x; - if (IS_SUM) local_sum += x; - },sum); - if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); - } - else - { - for (; nblocs_left; nblocs_left--) - { - // Get index of next bloc start: - const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size; - const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size; - //Asserts - assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc); - //Define Policy - Kokkos::RangePolicy policy(begin_bloc, end_bloc); - // Define the bloc sum - _TYPE_ bloc_sum = 0; - //Reduction - if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); - Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA( - const _SIZE_ i, _TYPE_ - &local_sum) - { - const _TYPE_ x = vx_view[i]; - if (IS_SQUARE) local_sum += x * x; - if (IS_SUM) local_sum += x; - } - ,bloc_sum); //Reduce in bloc_sum - if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); - - //Bloc-level reduction - sum += bloc_sum; - } - } + if (IS_SUM) local_sum += x; + } + }, sum); + if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); } } #endif @@ -807,11 +732,11 @@ template double local_operations_vect_bis_generic -void invalidate_data_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, +void invalidate_data_kernel(TRUSTVect<_TYPE_,_SIZE_>& tab_resu, const ArrOfInt& items_blocs, const int line_size, const int blocs_size) { _TYPE_ invalid = (_TYPE_)-987654321; - auto resu_view= resu.template view_rw<1, 
ExecSpace>().data(); + auto resu = tab_resu.template view_rw<1, ExecSpace>().data(); int i = 0; for (int blocs_idx = 0; blocs_idx < blocs_size; blocs_idx += 2) // process data until beginning of next bloc, or end of array @@ -823,19 +748,19 @@ void invalidate_data_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); Kokkos::parallel_for(policy,KOKKOS_LAMBDA(const int count) { - resu_view[count]=invalid; + resu[count]=invalid; }); if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); i = items_blocs[blocs_idx+1] * line_size; } - const _SIZE_ bloc_end = resu.size_array(); // Process until end of vector + const _SIZE_ bloc_end = tab_resu.size_array(); // Process until end of vector //Define Policy Kokkos::RangePolicy policy(i, bloc_end); //Loop if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); Kokkos::parallel_for(policy,KOKKOS_LAMBDA(const int count) { - resu_view[count]=invalid; + resu[count]=invalid; }); if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); } @@ -878,43 +803,26 @@ template void invalidate_data(TRUSTVect& resu, Mp_vect_options op namespace { template -void local_prodscal_kernel(const TRUSTVect<_TYPE_,_SIZE_>& vx, const TRUSTVect<_TYPE_,_SIZE_>& vy, int nblocs_left, +void local_prodscal_kernel(const TRUSTVect<_TYPE_,_SIZE_>& tab_vx, const TRUSTVect<_TYPE_,_SIZE_>& tab_vy, int nblocs_left, Block_Iter<_SIZE_>& bloc_itr, const int vect_size_tot, const int line_size, _TYPE_& sum) { - auto vx_view= vx.template view_ro<1, ExecSpace>().data(); - auto vy_view= vy.template view_ro<1, ExecSpace>().data(); -#ifdef TRUST_USE_GPU - if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel"); -#endif - for (; nblocs_left; nblocs_left--) - { - // Get index of next bloc start: - const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size; - const _SIZE_ end_bloc = (*(bloc_itr++)) 
* line_size; - - //Asserts - assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc); - - //Define Policy - Kokkos::RangePolicy policy(begin_bloc, end_bloc); - - // Define the bloc sum - _TYPE_ bloc_sum=0; - - //Reduction - if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); - Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const _SIZE_ i, _TYPE_& local_sum) + auto vx = tab_vx.template view_ro<1, ExecSpace>().data(); + auto vy = tab_vy.template view_ro<1, ExecSpace>().data(); + const bool has_items = static_cast(bloc_itr.items_); + auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr; + const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size; + if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__); + Kokkos::parallel_reduce(__KERNEL_NAME__, + Kokkos::RangePolicy(0, n_items), + KOKKOS_LAMBDA(const _SIZE_ i, _TYPE_& local_sum) + { + for (int j = 0; j < line_size; j++) { - local_sum += vx_view[i]*vy_view[i]; + const _SIZE_ item = (has_items ? items[i] : i) * line_size + j; + local_sum += vx[item]*vy[item]; } - , Kokkos::Sum<_TYPE_>(bloc_sum)); //Reduce in bloc_sum - - //timer - if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); - - //Bloc-level reduction - sum += bloc_sum; - } + }, sum); + if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space); } } #endif diff --git a/src/Kernel/Operateurs/Operateur_Grad_base.cpp b/src/Kernel/Operateurs/Operateur_Grad_base.cpp index 228bd6f6fd..a769754d1f 100644 --- a/src/Kernel/Operateurs/Operateur_Grad_base.cpp +++ b/src/Kernel/Operateurs/Operateur_Grad_base.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -50,7 +50,14 @@ void Operateur_Grad_base::dimensionner(Matrice_Morse& mat) const DoubleTab& Operateur_Grad_base::ajouter(const DoubleTab& inco, DoubleTab& secmem) const { if (has_interface_blocs()) - secmem *= -1, ajouter_blocs({}, secmem, {{ "pression", inco }}), secmem *= -1; /* pour avoir le bon signe */ + { + secmem *= -1; + tabs_t semi_impl; + //ajouter_blocs({}, secmem, {{"pression", inco}}); + semi_impl["pression"].ref(inco); /* evite la copie de inco dans tabs_t */ + ajouter_blocs({}, secmem, semi_impl); + secmem *= -1; /* pour avoir le bon signe */ + } else Process::exit(que_suis_je() + " : ajouter() not coded!"); return secmem; } diff --git a/src/Kernel/Postraitement/MED/Ecrire_MED.cpp b/src/Kernel/Postraitement/MED/Ecrire_MED.cpp index 1044f73c57..2a117c7d5f 100644 --- a/src/Kernel/Postraitement/MED/Ecrire_MED.cpp +++ b/src/Kernel/Postraitement/MED/Ecrire_MED.cpp @@ -38,6 +38,7 @@ using namespace MEDCoupling; Implemente_instanciable_32_64(Ecrire_MED_32_64,"Write_MED",Interprete); Add_synonym(Ecrire_MED,"Ecrire_MED"); +Add_synonym(Ecrire_MED_64, "Ecrire_MED_64"); // Anonymous namespace for local functions: namespace @@ -417,14 +418,6 @@ void Ecrire_MED_32_64<_SIZE_>::ecrire_domaine_dis(bool append) #endif } -#if INT_is_64_ == 2 -template <> -void Ecrire_MED_32_64::ecrire_domaine_dis(bool append) -{ - Process::exit("Ecrire_MED_32_64::ecrire_domaine_dis() -- Not allowed with a 64b object!"); -} -#endif - /*! @brief Permet d'ecrire le tableau de valeurs val comme un champ dans le fichier med de nom nom_fichier_, avec pour support le domaine de nom nom_dom. 
* * @param type: CHAMPPOINT,CHAMPMAILLE,CHAMPFACES diff --git a/src/Kernel/Postraitement/MED/TRUST_2_MED.cpp b/src/Kernel/Postraitement/MED/TRUST_2_MED.cpp index c03a794cf7..4e776792d3 100644 --- a/src/Kernel/Postraitement/MED/TRUST_2_MED.cpp +++ b/src/Kernel/Postraitement/MED/TRUST_2_MED.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -285,7 +285,7 @@ INTERP_KERNEL::NormalizedCellType type_geo_trio_to_type_medcoupling(const Nom& t type_cell = INTERP_KERNEL::NORM_QUAD4; mesh_dimension = 2; } - else if ((type_elem=="HEXAEDRE") || (type_elem=="HEXAEDRE_VEF")) + else if ((type_elem.debute_par("HEXAEDRE"))) { type_cell = INTERP_KERNEL::NORM_HEXA8; mesh_dimension = 3; @@ -295,7 +295,7 @@ INTERP_KERNEL::NormalizedCellType type_geo_trio_to_type_medcoupling(const Nom& t type_cell = INTERP_KERNEL::NORM_TRI3; mesh_dimension = 2; } - else if (type_elem=="TETRAEDRE") + else if ((type_elem.debute_par("TETRAEDRE"))) { type_cell = INTERP_KERNEL::NORM_TETRA4; mesh_dimension = 3; diff --git a/src/Kernel/Statistiques_temps/Integrale_tps_produit_champs.cpp b/src/Kernel/Statistiques_temps/Integrale_tps_produit_champs.cpp index 04e4bdb357..122b35ec88 100644 --- a/src/Kernel/Statistiques_temps/Integrale_tps_produit_champs.cpp +++ b/src/Kernel/Statistiques_temps/Integrale_tps_produit_champs.cpp @@ -16,6 +16,7 @@ #include #include #include +#include Implemente_instanciable(Integrale_tps_produit_champs, "Integrale_tps_produit_champs", Integrale_tps_Champ); @@ -46,6 +47,16 @@ void Integrale_tps_produit_champs::mettre_a_jour_integrale() if (t_courant != mon_second_champ()->get_time()) { + // Lors d'une reprise, certains champs (ex. gradient_vitesse) repartent a t=0 + // tandis que d'autres (ex. 
pression) sont relus depuis la sauvegarde. + // Si tps_integrale_ est deja avance (reprise), on ignore cette incoherence + // au premier pas et on synchronise tps_integrale_ sur le temps courant du + // premier champ pour repartir proprement. + if (tps_integrale_ > 0. && mon_second_champ()->get_time() == 0.) + { + tps_integrale_ = t_courant; + return; + } Cerr << "Integrale_tps_produit_champs::mettre_a_jour_integrale()" << finl; Cerr << "the current time of the field named " << nom[0] << " =" << t_courant << finl; Cerr << "is different of the second field current time " << nom2[0] << " =" << source2.temps() << finl; @@ -85,17 +96,13 @@ void Integrale_tps_produit_champs::ajoute_produit_tensoriel(double alpha, const { if (support_different_) { - ToDo_Kokkos("Use DoubleTrav and don't resize..."); // On ramene au centre des elements + const Domaine& dom = le_champ_->domaine_dis_base().domaine(); const DoubleTab& xp = ref_cast(Domaine_VF,le_champ_->domaine_dis_base()).xp(); int nb_elem_tot = xp.dimension_tot(0); - DoubleTab val_a, val_b; - // Le jour ou les champs seront mieux foutus, on n'aura - // pas a faire ca: - val_a.resize(nb_elem_tot, a.nb_comp()); - val_b.resize(nb_elem_tot, b.nb_comp()); - a.valeur_aux(xp, val_a); - b.valeur_aux(xp, val_b); + DoubleTrav val_a(nb_elem_tot, a.nb_comp()), val_b(nb_elem_tot, b.nb_comp()); + a.valeur_aux_centres_de_gravite(dom, val_a); + b.valeur_aux_centres_de_gravite(dom, val_b); le_champ_->valeurs().ajoute_produit_tensoriel(alpha, val_a, val_b); le_champ_->valeurs().echange_espace_virtuel(); } diff --git a/src/Kernel/Statistiques_temps/Moyenne_volumique.cpp b/src/Kernel/Statistiques_temps/Moyenne_volumique.cpp index ddd02105b7..8679f61329 100644 --- a/src/Kernel/Statistiques_temps/Moyenne_volumique.cpp +++ b/src/Kernel/Statistiques_temps/Moyenne_volumique.cpp @@ -258,7 +258,7 @@ int Moyenne_volumique::get_champ(const Nom& nom_pb, if (tmp == mc_nom_champ) { Operateur_Statistique_tps_base& stat = stats[i_stat].valeur(); - 
ref_cast_non_const(DoubleTab, stat.integrale().le_champ_calcule().valeurs()) = stat.calculer_valeurs(); + stat.calculer(ref_cast_non_const(DoubleTab, stat.integrale().le_champ_calcule().valeurs())); ref_champ = stat.integrale().le_champ_calcule(); return 1; } diff --git a/src/Kernel/Statistiques_temps/Op_Correlation.cpp b/src/Kernel/Statistiques_temps/Op_Correlation.cpp index 70c923076f..aab2c7d213 100644 --- a/src/Kernel/Statistiques_temps/Op_Correlation.cpp +++ b/src/Kernel/Statistiques_temps/Op_Correlation.cpp @@ -186,7 +186,7 @@ void Op_Correlation::completer(const Probleme_base& Pb, const Nom& prefix) integrale_tps_ab_.le_champ_calcule().changer_temps(Pb.schema_temps().temps_courant()); } -DoubleTab Op_Correlation::calculer_valeurs() const +void Op_Correlation::calculer(DoubleTab& tab_correlation) const { Integrale_tps_produit_champs correlation(integrale_tps_ab_); const double dt_ab = dt_integration_ab(); @@ -200,7 +200,7 @@ DoubleTab Op_Correlation::calculer_valeurs() const assert(est_egal(dt_b, dt_ab)); correlation.ajoute_produit_tensoriel(-1 / (dt_a * dt_b), integrale_tps_a_->le_champ_calcule(), integrale_tps_b_->le_champ_calcule()); } - return correlation.le_champ_calcule().valeurs(); + tab_correlation = correlation.le_champ_calcule().valeurs(); } int Op_Correlation::completer_post_statistiques(const Domaine& dom, const int is_axi, Format_Post_base& format) diff --git a/src/Kernel/Statistiques_temps/Op_Correlation.h b/src/Kernel/Statistiques_temps/Op_Correlation.h index 686b02dfce..fa748407be 100644 --- a/src/Kernel/Statistiques_temps/Op_Correlation.h +++ b/src/Kernel/Statistiques_temps/Op_Correlation.h @@ -51,7 +51,7 @@ class Op_Correlation : public Operateur_Statistique_tps_base inline int reprendre(Entree& is) override; inline void associer_op_stat(const Operateur_Statistique_tps_base&) override; void completer(const Probleme_base&, const Nom&) override; - DoubleTab calculer_valeurs() const override; + void calculer(DoubleTab&) const override; 
protected: OBS_PTR(Op_Moyenne) la_moyenne_a_; diff --git a/src/Kernel/Statistiques_temps/Op_Correlation_Triple.cpp b/src/Kernel/Statistiques_temps/Op_Correlation_Triple.cpp new file mode 100644 index 0000000000..8266e88517 --- /dev/null +++ b/src/Kernel/Statistiques_temps/Op_Correlation_Triple.cpp @@ -0,0 +1,415 @@ +/**************************************************************************** +* Copyright (c) 2026, CEA +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +Implemente_instanciable(Op_Correlation_Triple, "Op_Correlation_Triple", Operateur_Statistique_tps_base); + +Sortie& Op_Correlation_Triple::printOn(Sortie& s) const { return s << que_suis_je() << " " << le_nom(); } +Entree& Op_Correlation_Triple::readOn(Entree& s) { return s; } + +// --------------------------------------------------------------------------- +// get_dt : lecture efficace de dt_integration_ sans rapatrier tout le tableau. +// --------------------------------------------------------------------------- +double get_dt(const DoubleTab& tab, int off_dt) +{ + if (tab.size_array() == 0) return 0.; + // Pour ne recuperer qu'une valeur: + Kokkos::View dt("dt_integration"); + if (tab.nb_dim() == 1) + { + CDoubleArrView val = static_cast(tab).view_ro(); + Kokkos::deep_copy(dt, Kokkos::subview(val, 0)); + } + else + { + CDoubleTabView val = tab.view_ro(); + Kokkos::deep_copy(dt, Kokkos::subview(val, 0, off_dt)); + } + return dt(); +} + +// --------------------------------------------------------------------------- +// associer +// --------------------------------------------------------------------------- +void Op_Correlation_Triple::associer(const Domaine_dis_base& une_zdis, + const Champ_Generique_base& le_champ_a, + const Champ_Generique_base& le_champ_b, + const Champ_Generique_base& le_champ_c, + double t1, double t2) +{ + le_champ_a_ = le_champ_a; + le_champ_b_ = le_champ_b; + le_champ_c_ = le_champ_c; + + OWN_PTR(Champ_base) es; + const Champ_base& source = le_champ_a.get_champ(es); + Nom type = source.que_suis_je(); + int renomme = 0; + if (type.debute_par("Champ")) renomme = 1; + type.suffix("Champ_"); + type.suffix("Fonc_"); + Nom type_final("Champ_Fonc_"); + if (renomme) type_final += type; + else type_final = type; + + integrale_tps_abc_.typer_champ(type_final); + 
integrale_tps_abc_.le_champ_calcule().associer_domaine_dis_base(une_zdis); + + t_deb_ = t1; + t_fin_ = t2; + tps_courant_ = t1; + integrale_tps_abc_.fixer_t_debut(t1); + integrale_tps_abc_.fixer_t_fin(t2); + integrale_tps_abc_.fixer_tps_integrale(t1); + integrale_tps_abc_.fixer_dt_integr(0.); +} + +// --------------------------------------------------------------------------- +// completer +// +// Le champ calcule de integrale_tps_abc_ est dimensionne avec nb_comp_tot_ +// colonnes au lieu de nb_comp_abc_, pour encoder egalement moy_a_/b_/c_ et +// dt_integration_ dans le meme DoubleTab. +// +// Disposition des colonnes : +// [0 .. nb_comp_abc_-1] : integrale triple +// [off_moy_a_ .. off_moy_a_+nca-1] : moy_a_ = int(F dt) +// [off_moy_b_ .. off_moy_b_+ncb-1] : moy_b_ = int(G dt) +// [off_moy_c_ .. off_moy_c_+ncc-1] : moy_c_ = int(H dt) +// [off_dt_] : dt_integration_ (meme valeur sur tous les elems) +// +// Pour le postraitement, get_champ() extrait les nb_comp_abc_ premieres colonnes +// via calculer_valeurs(). La surcharge de integrale().le_champ_calcule().nb_comp() +// par le Champ_Generique retourne nb_comp_abc_ (voir get_property). 
+// --------------------------------------------------------------------------- +void Op_Correlation_Triple::completer(const Probleme_base& Pb, const Nom& prefix) +{ + const OBS_PTR(Champ_Generique_base)& a = le_champ_a_; + const OBS_PTR(Champ_Generique_base)& b = le_champ_b_; + const OBS_PTR(Champ_Generique_base)& c = le_champ_c_; + + const Domaine_dis_base& domaine = a->get_ref_domaine_dis_base(); + + OWN_PTR(Champ_base) es_a, es_b, es_c; + const Champ_base& source_a = a->get_champ(es_a); + const Champ_base& source_b = b->get_champ(es_b); + const Champ_base& source_c = c->get_champ(es_c); + + const int nca = source_a.nb_comp(); + const int ncb = source_b.nb_comp(); + const int ncc = source_c.nb_comp(); + nb_comp_a_ = nca; + nb_comp_b_ = ncb; + nb_comp_c_ = ncc; + nb_comp_abc_ = nca * ncb * ncc; + + off_moy_a_ = nb_comp_abc_; + off_moy_b_ = nb_comp_abc_ + nca; + off_moy_c_ = nb_comp_abc_ + nca + ncb; + off_dt_ = nb_comp_abc_ + nca + ncb + ncc; + nb_comp_tot_ = nb_comp_abc_ + nca + ncb + ncc + 1; + + bool ref_abc = sub_type(Champ_Generique_refChamp, a.valeur()) + && sub_type(Champ_Generique_refChamp, b.valeur()) + && sub_type(Champ_Generique_refChamp, c.valeur()); + + Noms noms_a, noms_b, noms_c, compo_a, compo_b, compo_c; + if (!ref_abc) + { + noms_a = a->get_property("nom"); + noms_b = b->get_property("nom"); + noms_c = c->get_property("nom"); + compo_a = a->get_property("composantes"); + compo_b = b->get_property("composantes"); + compo_c = c->get_property("composantes"); + } + else + { + noms_a = a->get_property("nom_cible"); + noms_b = b->get_property("nom_cible"); + noms_c = c->get_property("nom_cible"); + compo_a = source_a.noms_compo(); + compo_b = source_b.noms_compo(); + compo_c = source_c.noms_compo(); + } + const Nom nom_a = noms_a[0], nom_b = noms_b[0], nom_c = noms_c[0]; + const Noms unites_a = a->get_property("unites"); + const Noms unites_b = b->get_property("unites"); + const Noms unites_c = c->get_property("unites"); + + Nom type_P0 = 
"Champ_Fonc_P0_"; + type_P0 += Pb.discretisation().que_suis_je().substr_old(1, 3); + const int nb_val = domaine.nb_elem(); + + // Noms de composantes pour le tableau etendu (nb_comp_tot_ colonnes). + // Les premieres nb_comp_abc_ colonnes ont les vrais noms (pour le post). + // Les colonnes auxiliaires ont des noms internes (non postes). + Noms noms_comp(nb_comp_tot_); + { + Nom debut("Correlation_Triple_"); + for (int i = 0; i < nca; i++) + for (int j = 0; j < ncb; j++) + for (int k = 0; k < ncc; k++) + noms_comp[(i * ncb + j) * ncc + k] = + debut + compo_a[i] + "_" + compo_b[j] + "_" + compo_c[k]; + // Colonnes auxiliaires : noms internes + for (int i = 0; i < nca; i++) + noms_comp[off_moy_a_ + i] = Nom("_moy_a_") + compo_a[i]; + for (int i = 0; i < ncb; i++) + noms_comp[off_moy_b_ + i] = Nom("_moy_b_") + compo_b[i]; + for (int i = 0; i < ncc; i++) + noms_comp[off_moy_c_ + i] = Nom("_moy_c_") + compo_c[i]; + noms_comp[off_dt_] = "_dt_integration_"; + } + + Nom nom_post("Correlation_Triple_"); + nom_post += nom_a + "_" + nom_b + "_" + nom_c; + + Nom unite(unites_a[0]); + unite += "."; + unite += unites_b[0]; + unite += "."; + unite += unites_c[0]; + Noms unites_tot(nb_comp_tot_); + for (int i = 0; i < nb_comp_tot_; i++) unites_tot[i] = unite; + + integrale_tps_abc_.support_different() = 1; + integrale_tps_abc_.typer_champ(type_P0); + integrale_tps_abc_.le_champ_calcule().associer_domaine_dis_base(domaine); + integrale_tps_abc_.le_champ_calcule().fixer_nb_comp(nb_comp_tot_); + valeurs_etendues().resize(0, nb_comp_tot_); + integrale_tps_abc_.le_champ_calcule().fixer_nb_valeurs_nodales(nb_val); + valeurs_etendues() = 0.; + + integrale_tps_abc_.le_champ_calcule().nommer(nom_post); + integrale_tps_abc_.le_champ_calcule().set_pdi_name(prefix + nom_post); + integrale_tps_abc_.le_champ_calcule().fixer_noms_compo(noms_comp); + integrale_tps_abc_.le_champ_calcule().fixer_unites(unites_tot); + 
integrale_tps_abc_.le_champ_calcule().changer_temps(Pb.schema_temps().temps_courant()); +} + +void set_dt(DoubleTab& tab_dt, int off_dt, double val) +{ + if (tab_dt.size_array() == 0) return; + const int n = tab_dt.dimension_tot(0); + if (tab_dt.nb_dim() == 1) { tab_dt(0) = val; } + else + { + DoubleTabView tab = tab_dt.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i) + { + tab(i, off_dt) = val; + }); + end_gpu_timer(__KERNEL_NAME__); + } +} + +// --------------------------------------------------------------------------- +// mettre_a_jour +// --------------------------------------------------------------------------- +void Op_Correlation_Triple::mettre_a_jour(double tps) +{ + if (tps < t_deb_ || tps > t_fin_) return; + const double dt = tps - tps_courant_; + tps_courant_ = tps; + if (dt <= 0.) return; + + const Domaine_dis_base& zdis = le_champ_a_->get_ref_domaine_dis_base(); + const int n_elem = zdis.domaine().nb_elem_tot(); + + // Interpolation aux centres de gravite des elements. + // On suit exactement le patron de Integrale_tps_produit_champs::ajoute_produit_tensoriel : + // val.resize(nb_elem_tot, nb_comp) SANS md_vector, puis valeur_aux(xp, val). + // Ceci evite toute propagation de md_vector (faces en VEF) dans val, + // qui declencherait l'assertion md_vector_ == v.md_vector_ en mode debug. + const DoubleTab& xp = ref_cast(Domaine_VF, zdis).xp(); + auto aux_elems = [&](const OBS_PTR(Champ_Generique_base)& ch, int nb_comp, DoubleTab& tab_val) + { + OWN_PTR(Champ_base) es; + const Champ_base& src = ch->get_champ(es); + const DoubleTab& tab_raw = src.valeurs(); + if (tab_raw.dimension_tot(0) != n_elem) + { + // Champ aux faces : interpolation aux elements via valeur_aux(xp, tab_val). + // tab_val est dimensionne SANS md_vector pour ne pas heriter celui des faces. 
+ if (nb_comp > 1) tab_val.resize(n_elem, nb_comp); + else tab_val.resize(n_elem); + src.valeur_aux(xp, tab_val); + } + else + { + // Champ deja aux elements : copie simple. + // On copie les valeurs dans un tableau local sans md_vector + // pour rester coherent avec le cas interpole. + if (nb_comp > 1) tab_val.resize(n_elem, nb_comp, RESIZE_OPTIONS::NOCOPY_NOINIT); + else tab_val.resize(n_elem, RESIZE_OPTIONS::NOCOPY_NOINIT); + const int nloc = std::min(n_elem, (int)tab_raw.dimension_tot(0)); + if (nb_comp == 1) + { + CDoubleArrView raw = static_cast(tab_raw).view_ro(); + DoubleArrView val = static_cast(tab_val).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nloc), KOKKOS_LAMBDA(const int i) + { + val(i) = raw(i); + }); + end_gpu_timer(__KERNEL_NAME__); + } + else + { + CDoubleTabView raw = tab_raw.view_ro(); + DoubleTabView val = tab_val.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {nloc, nb_comp}), KOKKOS_LAMBDA(const int i, const int k) + { + val(i, k) = raw(i, k); + }); + end_gpu_timer(__KERNEL_NAME__); + } + } + }; + + DoubleTrav tab_val_a, tab_val_b, tab_val_c; + aux_elems(le_champ_a_, nb_comp_a_, tab_val_a); + aux_elems(le_champ_b_, nb_comp_b_, tab_val_b); + aux_elems(le_champ_c_, nb_comp_c_, tab_val_c); + + DoubleTab& tab_ext = valeurs_etendues(); + const double dt_old = get_dt(tab_ext, off_dt_); + const double dt_new = dt_old + dt; + + const int n = std::min((int)tab_ext.dimension_tot(0), n_elem); + + CDoubleArrView val_a = static_cast(tab_val_a).view_ro(); + CDoubleArrView val_b = static_cast(tab_val_b).view_ro(); + CDoubleArrView val_c = static_cast(tab_val_c).view_ro(); + DoubleTabView ext = tab_ext.view_rw(); + + const int nb_comp_a = nb_comp_a_, nb_comp_b = nb_comp_b_, nb_comp_c = nb_comp_c_; + const int off_moy_a = off_moy_a_, off_moy_b = off_moy_b_, off_moy_c = off_moy_c_; + + // Etape 1 : moy_X += X*dt (colonnes auxiliaires du tableau etendu) + 
Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {n, nb_comp_a}), KOKKOS_LAMBDA(const int i, const int ia) + { + ext(i, off_moy_a + ia) += (nb_comp_a == 1 ? val_a(i) : val_a(i * nb_comp_a + ia)) * dt; + }); + end_gpu_timer(__KERNEL_NAME__); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {n, nb_comp_b}), KOKKOS_LAMBDA(const int i, const int ib) + { + ext(i, off_moy_b + ib) += (nb_comp_b == 1 ? val_b(i) : val_b(i * nb_comp_b + ib)) * dt; + }); + end_gpu_timer(__KERNEL_NAME__); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {n, nb_comp_c}), KOKKOS_LAMBDA(const int i, const int ic) + { + ext(i, off_moy_c + ic) += (nb_comp_c == 1 ? val_c(i) : val_c(i * nb_comp_c + ic)) * dt; + }); + end_gpu_timer(__KERNEL_NAME__); + + // Etape 2 : integrale += (F-)*(G-)*(H-)*dt + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {n, nb_comp_a}), KOKKOS_LAMBDA(const int i, const int ia) + { + for (int ib = 0; ib < nb_comp_b; ib++) + for (int ic = 0; ic < nb_comp_c; ic++) + { + const int idx = (ia * nb_comp_b + ib) * nb_comp_c + ic; + const double fa = (nb_comp_a == 1 ? val_a(i) : val_a(i * nb_comp_a + ia)) - ext(i, off_moy_a + ia) / dt_new; + const double fb = (nb_comp_b == 1 ? val_b(i) : val_b(i * nb_comp_b + ib)) - ext(i, off_moy_b + ib) / dt_new; + const double fc = (nb_comp_c == 1 ? val_c(i) : val_c(i * nb_comp_c + ic)) - ext(i, off_moy_c + ic) / dt_new; + ext(i, idx) += fa * fb * fc * dt; + } + }); + end_gpu_timer(__KERNEL_NAME__); + + // Etape 3 : mise a jour de dt_integration_ (meme valeur partout) + set_dt(tab_ext, off_dt_, dt_new); + + integrale_tps_abc_.fixer_dt_integr(dt_new); + integrale_tps_abc_.le_champ_calcule().changer_temps(tps); +} + +// --------------------------------------------------------------------------- +// calculer_valeurs : extrait et renvoie les nb_comp_abc_ premieres colonnes. 
+// +// On utilise DoubleTrav(valeurs()) exactement comme Op_Moyenne::calculer_valeurs. +// DoubleTrav copie la structure (taille, md_vector) de l'espace de stockage +// du champ calcule, ce qui garantit que le tableau retourne porte le meme +// md_vector que l'espace de stockage de Champ_Generique_Correlation_Triple. +// Sans cela, l'affectation tab = calculer_valeurs() dans get_champ() declencherait +// l'assertion md_vector_ == v.md_vector_ en mode debug. +// +// Note : valeurs() retourne ici l'integrale_tps_abc_ qui a nb_comp_tot_ colonnes. +// On a besoin d'un tableau a nb_comp_abc_ colonnes. On cree donc un DoubleTrav +// a partir du champ calcule de espace_stockage_ (passe en argument dans get_champ) +// qui a la bonne taille. Mais calculer_valeurs() n'a pas acces a espace_stockage_. +// Solution : on cree le tableau retourne en copiant la structure de ext mais en +// ne retenant que les nb_comp_abc_ premieres colonnes via resize + copie scalaire, +// en preservant le md_vector via copy() de la structure de ext. +// --------------------------------------------------------------------------- +void Op_Correlation_Triple::fill_result(DoubleTab& tab) const +{ + // Remplit tab (deja dimensionne et portant le bon md_vector) avec les valeurs + // de la correlation triple normalisees par dt_integration_. + // Travaille directement sur tab sans creer de tableau temporaire, evitant ainsi + // tout probleme de md_vector dans les affectations. + const DoubleTab& tab_ext = valeurs_etendues(); + const double dt = get_dt(tab_ext, off_dt_); + const int n = std::min((int)tab_ext.dimension_tot(0), (int)tab.dimension_tot(0)); + + if (dt > 0.) 
+ { + CDoubleTabView ext = tab_ext.view_ro(); + if (nb_comp_abc_ == 1) + { + DoubleArrView val = static_cast(tab).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i) + { + val(i) = ext(i, 0) / dt; + }); + end_gpu_timer(__KERNEL_NAME__); + } + else + { + const int nb_comp = nb_comp_abc_; + DoubleTabView val = tab.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {n, nb_comp}), KOKKOS_LAMBDA(const int i, const int k) + { + val(i, k) = ext(i, k) / dt; + }); + end_gpu_timer(__KERNEL_NAME__); + } + } + else + tab = 0.; +} + +void Op_Correlation_Triple::calculer(DoubleTab& tab) const +{ + // Pour les appels hors get_champ (ex: tests unitaires). + // tab est un DoubleTab brut sans md_vector; fill_result y ecrit directement. + const int n = valeurs_etendues().dimension_tot(0); + if (nb_comp_abc_ == 1) tab.resize(n, RESIZE_OPTIONS::NOCOPY_NOINIT); + else tab.resize(n, nb_comp_abc_, RESIZE_OPTIONS::NOCOPY_NOINIT); + fill_result(tab); +} + +int Op_Correlation_Triple::completer_post_statistiques(const Domaine&, const int, Format_Post_base&) +{ + return 1; +} diff --git a/src/Kernel/Statistiques_temps/Op_Correlation_Triple.h b/src/Kernel/Statistiques_temps/Op_Correlation_Triple.h new file mode 100644 index 0000000000..f972745fc6 --- /dev/null +++ b/src/Kernel/Statistiques_temps/Op_Correlation_Triple.h @@ -0,0 +1,189 @@ +/**************************************************************************** +* Copyright (c) 2026, CEA +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +* 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*****************************************************************************/ + +#ifndef Op_Correlation_Triple_included +#define Op_Correlation_Triple_included + +#include +#include +#include +#include + +/*! @brief class Op_Correlation_Triple + * + * Calcule (1/T) * int_0^T (F-)*(G-)*(H-) dt + * ou (t) = (1/t) * int_0^t X dt' est la moyenne courante. + * + * Algorithme : accumulation incrementale a la volee. + * A chaque pas dt : + * 1. moy_a_ += F*dt, moy_b_ += G*dt, moy_c_ += H*dt + * 2. integrale += (F - moy_a_/T_new)*(G - moy_b_/T_new)*(H - moy_c_/T_new)*dt + * avec T_new = dt_integration_ + dt. 
+ * + * === Strategie de sauvegarde / reprise === + * + * Le framework TRUST impose qu'un operateur ne sauvegarde/reprenne qu'UN + * SEUL bloc champ : le framework ecrit/lit [ident][type] et l'operateur + * ecrit/lit [temps][donnees]. + * Solution : on encode TOUT l'etat dans un UNIQUE DoubleTab etendu + * (integrale_etendue_) de taille (n_elem, nb_comp + nb_comp_a + nb_comp_b + nb_comp_c + 1) : + * colonnes [0 .. nb_comp-1] : integrale triple + * colonnes [nb_comp .. nb_comp+nca-1] : moy_a_ + * colonnes [nb_comp+nca .. nb_comp+nca+ncb-1] : moy_b_ + * colonnes [nb_comp+nca+ncb .. nb_comp+nca+ncb+ncc-1] : moy_c_ + * colonne [nb_comp+nca+ncb+ncc] : dt_integration_ + * + * Ce DoubleTab etendu est le champ calcule de integrale_tps_abc_, nomme avec + * son pdi_name normal. Ainsi sauvegarder/reprendre deleguent entierement a + * Champ_Fonc_base::sauvegarder/reprendre, qui gere xyz, single_hdf et pdi + * sans aucun code specifique dans notre operateur. + * + * Pour le postraitement, get_champ() extrait les nb_comp premieres colonnes. + * Pour PDI, data_a_sauvegarder() declare un seul champ (le tableau etendu), + * exactement comme Op_Correlation. 
+ * + */ +class Op_Correlation_Triple : public Operateur_Statistique_tps_base +{ + Declare_instanciable(Op_Correlation_Triple); +public: + inline const Nom& le_nom() const override { return integrale_tps_abc_.le_champ_calcule().le_nom(); } + inline double temps() const override { return integrale_tps_abc_.le_champ_calcule().temps(); } + inline const Integrale_tps_produit_champs& integrale() const override { return integrale_tps_abc_; } + + inline const OBS_PTR(Champ_Generique_base)& le_champ_a() const { return le_champ_a_; } + // Nombre de composantes a poster (sans les colonnes auxiliaires du tableau etendu) + inline int nb_comp_post() const { return nb_comp_abc_; } + + // Acces aux parties du tableau etendu + inline DoubleTab& valeurs_etendues() { return integrale_tps_abc_.le_champ_calcule().valeurs(); } + inline const DoubleTab& valeurs_etendues() const { return integrale_tps_abc_.le_champ_calcule().valeurs(); } + + void mettre_a_jour(double tps) override; + inline void initialiser(double val) override; + inline void associer(const Domaine_dis_base&, const Champ_base&, double t1, double t2); + inline void associer(const Domaine_dis_base&, const Champ_Generique_base&, double t1, double t2) override; + void associer(const Domaine_dis_base&, const Champ_Generique_base&, + const Champ_Generique_base&, const Champ_Generique_base&, double t1, double t2); + inline void fixer_tstat_deb(double, double) override; + inline void fixer_tstat_fin(double) override; + int completer_post_statistiques(const Domaine& dom, const int is_axi, Format_Post_base& format) override; + inline std::vector data_a_sauvegarder() const override; + inline int sauvegarder(Sortie& os) const override; + inline int reprendre(Entree& is) override; + void associer_op_stat(const Operateur_Statistique_tps_base&) override { } + void completer(const Probleme_base&, const Nom&) override; + void calculer(DoubleTab&) const override; + void fill_result(DoubleTab& tab) const; // ecrit directement dans tab 
(preserves md_vector) + +protected: + OBS_PTR(Champ_Generique_base) le_champ_a_, le_champ_b_, le_champ_c_; + + // Tableau etendu : [integrale_triple | moy_a | moy_b | moy_c | dt_integration] + // Taille : (n_elem, nb_comp_abc + nca + ncb + ncc + 1) + // Stocke dans le champ calcule de integrale_tps_abc_. + Integrale_tps_produit_champs integrale_tps_abc_; + + // Offsets dans le tableau etendu (fixes dans completer()) + int nb_comp_abc_ = 1; // nb_comp_a * nb_comp_b * nb_comp_c + int off_moy_a_ = 1; // = nb_comp_abc_ + int off_moy_b_ = 2; // = nb_comp_abc_ + nca + int off_moy_c_ = 3; // = nb_comp_abc_ + nca + ncb + int off_dt_ = 4; // = nb_comp_abc_ + nca + ncb + ncc + int nb_comp_tot_ = 5; // = nb_comp_abc_ + nca + ncb + ncc + 1 + + int nb_comp_a_ = 1, nb_comp_b_ = 1, nb_comp_c_ = 1; + + double t_deb_ = 0.; + double t_fin_ = 1.e30; + double tps_courant_ = 0.; +}; + +// --------------------------------------------------------------------------- +// Accesseurs inline vers les parties du tableau etendu +// --------------------------------------------------------------------------- + +// dt_integration_ encode en colonne off_dt_ ligne 0 (identique sur tous les elems) +double get_dt(const DoubleTab& tab, int off_dt); + +void set_dt(DoubleTab& tab, int off_dt, double val); + +inline void Op_Correlation_Triple::initialiser(double val_init) +{ + integrale_tps_abc_.le_champ_calcule().valeurs() = val_init; +} + +inline void Op_Correlation_Triple::associer(const Domaine_dis_base&, const Champ_base&, double, double) +{ + Cerr << "Exactly three fields must be associated to triple correlation operator." << finl; + exit(); +} + +inline void Op_Correlation_Triple::associer(const Domaine_dis_base&, const Champ_Generique_base&, double, double) +{ + Cerr << "Exactly three fields must be associated to triple correlation operator." 
<< finl; + exit(); +} + +inline void Op_Correlation_Triple::fixer_tstat_deb(double tdeb, double tps) +{ + // Ne PAS remettre les valeurs a zero ici : fixer_tstat_deb est appele + // par le framework APRES reprendre() pour repositionner les bornes + // temporelles, et ne doit pas ecraser les donnees restaurees. + // La remise a zero se fait uniquement via initialiser(0), appele par + // fixer_serie() (cas statistiques en serie), pas lors d'une reprise. + t_deb_ = tdeb; + tps_courant_ = tps; + integrale_tps_abc_.fixer_t_debut(tdeb); + integrale_tps_abc_.fixer_tps_integrale(tps); + integrale_tps_abc_.fixer_dt_integr(tps - tdeb); +} + +inline void Op_Correlation_Triple::fixer_tstat_fin(double tps) +{ + t_fin_ = tps; + integrale_tps_abc_.fixer_t_fin(tps); +} + +inline std::vector Op_Correlation_Triple::data_a_sauvegarder() const +{ + // Un seul champ, exactement comme Op_Correlation + const Nom& name = integrale_tps_abc_.le_champ_calcule().get_pdi_name(); + int nb_dim = integrale_tps_abc_.le_champ_calcule().valeurs().nb_dim(); + YAML_data d(name.getString(), "double", nb_dim); + std::vector data; + data.push_back(d); + return data; +} + +inline int Op_Correlation_Triple::sauvegarder(Sortie& os) const +{ + // Delegation complete a Champ_Fonc_base::sauvegarder, exactement comme Op_Correlation + return integrale_tps_abc_.le_champ_calcule().sauvegarder(os); +} + +inline int Op_Correlation_Triple::reprendre(Entree& is) +{ + integrale_tps_abc_.le_champ_calcule().reprendre(is); + // Resynchronisation depuis le tableau etendu restaure + const double dt_repris = get_dt(valeurs_etendues(), off_dt_); + const double tps_repris = integrale_tps_abc_.le_champ_calcule().temps(); + tps_courant_ = tps_repris; + integrale_tps_abc_.fixer_tps_integrale(tps_repris); + integrale_tps_abc_.fixer_dt_integr(dt_repris); + return 1; +} + +#endif diff --git a/src/Kernel/Statistiques_temps/Op_Ecart_type.cpp b/src/Kernel/Statistiques_temps/Op_Ecart_type.cpp index 365d058ec9..078b168ac3 100644 --- 
a/src/Kernel/Statistiques_temps/Op_Ecart_type.cpp +++ b/src/Kernel/Statistiques_temps/Op_Ecart_type.cpp @@ -68,7 +68,7 @@ void Op_Ecart_type::completer(const Probleme_base& Pb, const Nom& prefix) integrale_carre_champ_.le_champ_calcule().changer_temps(Pb.schema_temps().temps_courant()); } -DoubleTab Op_Ecart_type::calculer_valeurs() const +void Op_Ecart_type::calculer(DoubleTab& ecart_type) const { double dt = dt_integration(); if (!est_egal(dt, dt_integration_carre())) @@ -76,7 +76,6 @@ DoubleTab Op_Ecart_type::calculer_valeurs() const Cerr << "Not implemented yet in Op_Ecart_type::calculer_valeurs()" << finl; exit(); } - DoubleTrav ecart_type(valeurs_carre()); ecart_type = valeurs_carre(); if (dt > 0) { @@ -86,5 +85,4 @@ DoubleTab Op_Ecart_type::calculer_valeurs() const ecart_type.abs(); // To avoid negative number ? ecart_type.racine_carree(); // sqrt(mean(I^2)-mean(I)^2) } - return ecart_type; } diff --git a/src/Kernel/Statistiques_temps/Op_Ecart_type.h b/src/Kernel/Statistiques_temps/Op_Ecart_type.h index c14c38be39..948ae030f4 100644 --- a/src/Kernel/Statistiques_temps/Op_Ecart_type.h +++ b/src/Kernel/Statistiques_temps/Op_Ecart_type.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -44,7 +44,7 @@ class Op_Ecart_type : public Operateur_Statistique_tps_base inline void fixer_tstat_fin(double) override; inline void associer_op_stat(const Operateur_Statistique_tps_base&) override; void completer(const Probleme_base&, const Nom&) override; - DoubleTab calculer_valeurs() const override; + void calculer(DoubleTab&) const override; inline std::vector data_a_sauvegarder() const override; inline int sauvegarder(Sortie& os) const override; inline int reprendre(Entree& is) override; diff --git a/src/Kernel/Statistiques_temps/Op_Moyenne.cpp b/src/Kernel/Statistiques_temps/Op_Moyenne.cpp index 4fd29ac6d8..cc3e15049e 100644 --- a/src/Kernel/Statistiques_temps/Op_Moyenne.cpp +++ b/src/Kernel/Statistiques_temps/Op_Moyenne.cpp @@ -71,12 +71,10 @@ void Op_Moyenne::completer(const Probleme_base& Pb, const Nom& prefix) integrale_champ_.le_champ_calcule().changer_temps(Pb.schema_temps().temps_courant()); } -DoubleTab Op_Moyenne::calculer_valeurs() const +void Op_Moyenne::calculer(DoubleTab& moyenne) const { double dt = dt_integration(); - DoubleTrav moyenne(valeurs()); moyenne = valeurs(); if (dt > 0) moyenne /= dt; - return moyenne; } diff --git a/src/Kernel/Statistiques_temps/Op_Moyenne.h b/src/Kernel/Statistiques_temps/Op_Moyenne.h index 0cd743592e..a91c3cb4be 100644 --- a/src/Kernel/Statistiques_temps/Op_Moyenne.h +++ b/src/Kernel/Statistiques_temps/Op_Moyenne.h @@ -43,7 +43,7 @@ class Op_Moyenne: public Operateur_Statistique_tps_base inline int sauvegarder(Sortie& os) const override; inline int reprendre(Entree& is) override; void completer(const Probleme_base&, const Nom&) override; - DoubleTab calculer_valeurs() const override; + void calculer(DoubleTab&) const override; protected: Integrale_tps_Champ integrale_champ_; diff --git a/src/Kernel/Statistiques_temps/Operateur_Statistique_tps_base.h 
b/src/Kernel/Statistiques_temps/Operateur_Statistique_tps_base.h index 675fd0019a..2be7e80dcc 100644 --- a/src/Kernel/Statistiques_temps/Operateur_Statistique_tps_base.h +++ b/src/Kernel/Statistiques_temps/Operateur_Statistique_tps_base.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -47,7 +47,7 @@ class Operateur_Statistique_tps_base : public Objet_U virtual const Integrale_tps_Champ& integrale() const =0; virtual void initialiser(double val) =0; virtual void completer(const Probleme_base&, const Nom& post_name) =0; - virtual DoubleTab calculer_valeurs() const =0; + virtual void calculer(DoubleTab&) const =0; virtual int completer_post_statistiques(const Domaine& dom,const int is_axi,Format_Post_base& format); inline double tstat_deb() const { return tstat_deb_; } inline double tstat_fin() const { return tstat_fin_; } diff --git a/src/Kernel/Utilitaires/Device.cpp b/src/Kernel/Utilitaires/Device.cpp index 6a36e81cde..04e3642a7a 100644 --- a/src/Kernel/Utilitaires/Device.cpp +++ b/src/Kernel/Utilitaires/Device.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -33,19 +33,6 @@ #include #endif -/* -bool init_device_ = false; -bool clock_on = false; -bool fence = true; -double clock_start; -int timer_counter=0; -#ifdef TRUST_USE_GPU -bool timer = true; -#else -bool timer = false; -#endif -*/ - std::string ptrToString(const void* adr) { std::stringstream ss; @@ -197,23 +184,30 @@ _TYPE_* allocateOnDevice(_TYPE_* ptr, _SIZE_ size) #ifdef TRUST_USE_GPU assert(!isAllocatedOnDevice(ptr)); // Verifie que la zone n'est pas deja allouee statistics().begin_count(STD_COUNTERS::gpu_malloc_free,statistics().get_last_opened_counter_level()+1); - size_t bytes = sizeof(_TYPE_) * size; - size_t free_bytes = DeviceMemory::deviceMemGetInfo(0); - size_t total_bytes = DeviceMemory::deviceMemGetInfo(1); - if (bytes>free_bytes) + _TYPE_ *device_ptr = nullptr; + if (size>0) { - Cerr << "Error ! Trying to allocate " << bytes << " bytes on GPU memory whereas only " << free_bytes << " bytes are available." << finl; - Process::exit(); + size_t bytes = sizeof(_TYPE_) * size; + size_t free_bytes = DeviceMemory::deviceMemGetInfo(0); + size_t total_bytes = DeviceMemory::deviceMemGetInfo(1); + if (bytes > free_bytes) + { + Cerr << "Error ! Trying to allocate " << bytes << " bytes on GPU memory whereas only " << free_bytes + << " bytes are available." << finl; + Process::exit(); + } + device_ptr = static_cast<_TYPE_ *>(Kokkos::kokkos_malloc(bytes)); + if (statistics().is_gpu_verbose_on() && Process::je_suis_maitre()) + { + std::string clock(Process::is_parallel() ? 
"[clock]#" + std::to_string(Process::me()) : "[clock] "); + double ms = 1000 * statistics().get_time_since_last_open(STD_COUNTERS::gpu_malloc_free); + printf("%s %7.3f ms [Data] Allocate on device [%9s] %6ld Bytes (%ld/%ldGB free) Currently allocated: %6ld\n", + clock.c_str(), ms, ptrToString(ptr).c_str(), long(bytes), free_bytes / (1024 * 1024 * 1024), + total_bytes / (1024 * 1024 * 1024), long(DeviceMemory::allocatedBytesOnDevice())); + } } - _TYPE_* device_ptr = static_cast<_TYPE_*>(Kokkos::kokkos_malloc(bytes)); // Map host_ptr with device_ptr: DeviceMemory::add(ptr, device_ptr, size * sizeof(_TYPE_)); - if (statistics().is_gpu_verbose_on() && Process::je_suis_maitre()) - { - std::string clock(Process::is_parallel() ? "[clock]#"+std::to_string(Process::me()) : "[clock] "); - double ms = 1000 * statistics().get_time_since_last_open(STD_COUNTERS::gpu_malloc_free); - printf("%s %7.3f ms [Data] Allocate on device [%9s] %6ld Bytes (%ld/%ldGB free) Currently allocated: %6ld\n", clock.c_str(), ms, ptrToString(ptr).c_str(), long(bytes), free_bytes/(1024*1024*1024), total_bytes/(1024*1024*1024), long(DeviceMemory::allocatedBytesOnDevice())); - } statistics().end_count(STD_COUNTERS::gpu_malloc_free); #ifndef NDEBUG const _TYPE_ INVALIDE_ = (std::is_same<_TYPE_,double>::value) ? DMAXFLOAT*0.999 : ( (std::is_same<_TYPE_,int>::value) ? INT_MIN : 0); // Identique a TRUSTArray<_TYPE_>::fill_default_value() @@ -509,15 +503,16 @@ std::string start_gpu_timer(std::string str, int bytes) #ifdef TRUST_USE_GPU if (!statistics().get_init_device()) return str; + //std::cerr << "Provisoire start_gpu_timer " << str << std::endl; if (statistics().get_gpu_timer()) - Process::exit("A GPU KERNEL is still running, you can't open a new one yet"); + { + Cerr << "A GPU KERNEL is still running, you can't open a new one (" << str << ") yet." << finl; + Cerr <<"Probably you forgot to define a end_gpu_timer(...) call." 
<< finl; + Cerr <<"Or more subtil bug: you copy a C++ object on the device which has at least one TRUST array as attribute." << finl; + Process::exit(); + } statistics().start_gpu_timer(); statistics().add_to_gpu_timer_counter(1); -#ifndef NDEBUG - if (statistics().get_gpu_timer_counter()>1) - Cerr << "[Kokkos] timer_counter=" << statistics().get_gpu_timer_counter() << " : start_gpu_timer() not closed by end_gpu_timer() !" << finl; - //Process::exit("Error, start_gpu_timer() not closed by end_gpu_timer() !"); -#endif if (bytes == -1) statistics().begin_count(STD_COUNTERS::gpu_kernel,statistics().get_last_opened_counter_level()+1); #ifdef TRUST_USE_CUDA @@ -535,12 +530,8 @@ void end_gpu_timer(const std::string& str, int onDevice, int bytes) // Return in #ifdef TRUST_USE_GPU if (!statistics().get_init_device()) return; + //std::cerr << "Provisoire end_gpu_timer " << str << std::endl; statistics().add_to_gpu_timer_counter(-1); -#ifndef NDEBUG - if (statistics().get_gpu_timer_counter()!=0) - Cerr << "[Kokkos] timer_counter=" << statistics().get_gpu_timer_counter() << " : end_gpu_timer() not opened by start_gpu_timer() !" << finl; - //Process::exit("Error, start_gpu_timer() not closed by end_gpu_timer() !"); -#endif if (onDevice) { #ifdef TRUST_USE_UVM diff --git a/src/Kernel/Utilitaires/Schema_Comm_Vecteurs.cpp b/src/Kernel/Utilitaires/Schema_Comm_Vecteurs.cpp index 1eb5d28f15..cd7b41146a 100644 --- a/src/Kernel/Utilitaires/Schema_Comm_Vecteurs.cpp +++ b/src/Kernel/Utilitaires/Schema_Comm_Vecteurs.cpp @@ -94,7 +94,7 @@ Schema_Comm_Vecteurs::Schema_Comm_Vecteurs() if (getenv("MPICH_GPU_SUPPORT_ENABLED") == nullptr) Process::exit("You try to enable GPU communications on Cray MPICH with TRUST_USE_MPI_GPU_AWARE=1 but forgot to set also MPICH_GPU_SUPPORT_ENABLED=1 !"); #endif - std::cerr << "[MPI] Enabling GPU capability to communicate between devices." << std::endl; + //std::cerr << "[MPI] Enabling GPU capability to communicate between devices." 
<< std::endl; //Cerr << "[MPI] Warning! Only MPI calls with device pointers will benefit. Classic MPI calls with host pointers will be slower..." << finl; } } diff --git a/src/Kernel/Utilitaires/View_Types.h b/src/Kernel/Utilitaires/View_Types.h index bb59387dd5..24f34d868b 100644 --- a/src/Kernel/Utilitaires/View_Types.h +++ b/src/Kernel/Utilitaires/View_Types.h @@ -43,6 +43,8 @@ using host_mirror_space = Kokkos::HostSpace; // The execution space (=where code is run): on the device if compiled for GPU, else CPU. using execution_space = DeviceView::execution_space; +using HostSpace = Kokkos::DefaultHostExecutionSpace; +using DeviceSpace = Kokkos::DefaultExecutionSpace; // Typedefs for range policies in kernels using range_1D = Kokkos::RangePolicy; diff --git a/src/Kernel/Utilitaires/kokkos++.h b/src/Kernel/Utilitaires/kokkos++.h index 5f88d1e426..12b2493f3b 100644 --- a/src/Kernel/Utilitaires/kokkos++.h +++ b/src/Kernel/Utilitaires/kokkos++.h @@ -44,8 +44,13 @@ #pragma diag_warning 47 #endif +// CUDA extended lambdas (KOKKOS_LAMBDA) cannot be defined inside a member function with private or protected access. +// These macros preserve the intended C++ access level in CPU builds while forcing +// public visibility in CUDA builds so that Kokkos kernels defined in these sections compile. 
#ifdef TRUST_USE_CUDA -#define public_for_cuda public: +#define protected_but_public_for_cuda public: +#define private_but_public_for_cuda public: #else -#define public_for_cuda protected: +#define protected_but_public_for_cuda protected: +#define private_but_public_for_cuda private: #endif diff --git a/src/Kernel/Utilitaires/kokkos_test.cpp b/src/Kernel/Utilitaires/kokkos_test.cpp index aaca1a6922..18578b4979 100644 --- a/src/Kernel/Utilitaires/kokkos_test.cpp +++ b/src/Kernel/Utilitaires/kokkos_test.cpp @@ -153,7 +153,6 @@ void kokkos_self_test() // C++ object in Kokkos region { ArrOfDouble f(nb_elem); - f = 0; std::string expr("2*x+2"); // Parser sur le device; ParserView parser(expr, 1); diff --git a/src/Kernel/VF/Champs/Champ_Fonc_reprise.cpp b/src/Kernel/VF/Champs/Champ_Fonc_reprise.cpp index bc81db1ad6..5cbd965057 100644 --- a/src/Kernel/VF/Champs/Champ_Fonc_reprise.cpp +++ b/src/Kernel/VF/Champs/Champ_Fonc_reprise.cpp @@ -466,7 +466,7 @@ void Champ_Fonc_reprise::read_field_from_file(Entree& jdd, Entree& file, const P champ_moyen.reprendre(file); // On remplit le champ - le_champ().valeurs() = champ_moyen.calculer_valeurs(); + champ_moyen.calculer(le_champ().valeurs()); } else if (reprend_modele_k_eps) { diff --git a/src/Kernel/VF/Champs/Champ_front_recyclage.cpp b/src/Kernel/VF/Champs/Champ_front_recyclage.cpp index d1960cca75..c85b1562d7 100644 --- a/src/Kernel/VF/Champs/Champ_front_recyclage.cpp +++ b/src/Kernel/VF/Champs/Champ_front_recyclage.cpp @@ -454,6 +454,7 @@ void Champ_front_recyclage::get_coord_faces(const Frontiere_dis_base& fr_vf, const int dim = xv2.dimension(1); coords.resize(nb_faces2,dim); + ToDo_Kokkos("critical"); for (int i = 0; i < nb_faces2; i++) for (int j = 0; j < dim; j++) coords(i,j) = xv2(i+ndeb2,j) + delt_dist(j); @@ -625,6 +626,7 @@ int Champ_front_recyclage::initialiser(double temps, const Champ_Inc_base& inco) int nb_remote_faces = 0; // Loop on local faces on the process pe: + ToDo_Kokkos("critical"); for (int face = 0; face 
< nb_faces_on_pe; face++) { const int elem = elem_list[face]; @@ -659,6 +661,7 @@ int Champ_front_recyclage::initialiser(double temps, const Champ_Inc_base& inco) //Cerr << index_to_recv << finl; } bool error_1 = false, error_2 = false; + ToDo_Kokkos("critical"); for (int i = 0; i < nb_faces2; i++) { if (count[i] < 1) @@ -756,6 +759,7 @@ void Champ_front_recyclage::mettre_a_jour(double temps) calcul_moyenne_imposee(tab,temps); calcul_moyenne_recyclee(tab,temps); + ToDo_Kokkos("critical"); for (int i=0; i mat(n_ext); //matrices std::vector N, ne_tot; //composantes, nombre d'elements total par pb std::vector> domaine; //domaines @@ -127,6 +126,7 @@ void Op_Diff_PolyMAC_HFV_Elem::dimensionner_blocs_ext(int aux_only, matrices_t m std::vector> diffu, inco; //inconnues, normales aux faces, positions elems / faces / sommets std::deque v_part; //blocs de chaque inconnue std::vector stencil(n_ext); //stencils par matrice + decltype(stencil[0].dimension(0)) n_sten = 0; for (i = 0, M = 0; i < n_ext; M = std::max(M, N[i]), i++) { std::string nom_mat = i ? nom_inco + "/" + op_ext[i]->equation().probleme().le_nom().getString() : nom_inco; @@ -180,6 +180,7 @@ void Op_Diff_PolyMAC_HFV_Elem::dimensionner_blocs_ext(int aux_only, matrices_t m } /* problemes distants : pour les Echange_contact */ const Echange_contact_PolyMAC_HFV *pcl; + long p; if (!semi) for (i = 0; i < cls[0].get().size(); i++) if ((pcl = sub_type(Echange_contact_PolyMAC_HFV, cls[0].get()[i].valeur()) ? 
&ref_cast(Echange_contact_PolyMAC_HFV, cls[0].get()[i].valeur()) : nullptr)) diff --git a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Elem.cpp b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Elem.cpp index 44949d5f37..d2245797ef 100644 --- a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Elem.cpp +++ b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Elem.cpp @@ -246,7 +246,7 @@ void Op_Diff_PolyMAC_MPFA_Elem::dimensionner_blocs(matrices_t matrices, const ta *mat = mat2; } - int n_sten = stencil.dimension(0); + auto n_sten = stencil.dimension(0); const double elem_t = static_cast(domaine.domaine().md_vector_elements()->nb_items_seq_tot()), face_t = static_cast(domaine.md_vector_faces()->nb_items_seq_tot()); diff --git a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Face.cpp b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Face.cpp index 9ea42b83f2..92f4c0f167 100644 --- a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Face.cpp +++ b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Face.cpp @@ -213,10 +213,10 @@ void Op_Diff_PolyMAC_MPFA_Face::dimensionner_blocs(matrices_t matrices, const ta } tableau_trier_retirer_doublons(stencil); +#ifndef TRUST_USE_GPU const double face_t = static_cast(domaine.md_vector_faces()->nb_items_seq_tot()), elem_t = static_cast(domaine.domaine().md_vector_elements()->nb_items_seq_tot()); const double width = mp_sum_as_double(stencil.dimension(0)) / (N * (face_t + D * elem_t)); -#ifndef TRUST_USE_GPU const double perc = mp_somme_vect_as_double(tpfa) * 100. 
/ (N * face_t); Cerr << "width " << width << " " << perc << "% TPFA " << finl; #endif diff --git a/src/ThHyd/Dilatable/Common/Cond_Lim/Entree_fluide_temperature_imposee_H.h b/src/ThHyd/Dilatable/Common/Cond_Lim/Entree_fluide_temperature_imposee_H.h index 5aa3901c21..9a1d8c5ce4 100644 --- a/src/ThHyd/Dilatable/Common/Cond_Lim/Entree_fluide_temperature_imposee_H.h +++ b/src/ThHyd/Dilatable/Common/Cond_Lim/Entree_fluide_temperature_imposee_H.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -29,10 +29,12 @@ class Entree_fluide_temperature_imposee_H : public Entree_fluide_temperature_im { Declare_instanciable(Entree_fluide_temperature_imposee_H); public : + using Dirichlet::val_imp; void completer() override; double val_imp(int i) const override; double val_imp(int i, int j) const override; - + double val_imp_au_temps(double temps, int i) const override { return val_imp(i); } + double val_imp_au_temps(double temps, int i, int j) const override { return val_imp(i,j); } protected : OBS_PTR(Fluide_Dilatable_base) le_fluide; }; diff --git a/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.cpp b/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.cpp index 50e7874c82..e9503ce61e 100644 --- a/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.cpp +++ b/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -43,38 +43,6 @@ void Neumann_sortie_libre_Temp_H::completer() modifier_val_imp = 1; } -/*! @brief Renvoie la valeur de la i-eme composante du champ impose a l'exterieur de la frontiere. - * - * @param (int i) indice suivant la premiere dimension du champ - * @return (double) la valeur imposee sur la composante du champ specifiee - * @throws deuxieme dimension du champ de frontiere superieur a 1 - */ -double Neumann_sortie_libre_Temp_H::val_ext(int i) const -{ - if (le_champ_ext->valeurs().size() == 1) - { - if (modifier_val_imp == 1) - return le_fluide->calculer_H(le_champ_ext->valeurs()(0, 0)); - else - return le_champ_ext->valeurs()(0, 0); - } - else if (le_champ_ext->valeurs().dimension(1) == 1) - { - if (modifier_val_imp == 1) - return le_fluide->calculer_H(le_champ_ext->valeurs()(i, 0)); - else - return le_champ_ext->valeurs()(i, 0); - } - else - { - Cerr << "Neumann_sortie_libre_Temp_H::val_ext" << finl; - Cerr << le_champ_ext << finl; - } - - abort(); - return 0.; -} - /*! @brief Renvoie la valeur de la (i,j)-eme composante du champ impose a l'exterieur de la frontiere. * * @param (int i) indice suivant la premiere dimension du champ diff --git a/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.h b/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.h index c264c32393..eafc636de5 100644 --- a/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.h +++ b/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -33,7 +33,7 @@ class Neumann_sortie_libre_Temp_H : public Neumann_sortie_libre Declare_instanciable(Neumann_sortie_libre_Temp_H); public: void completer() override; - double val_ext(int i) const override; + double val_ext(int i) const override { return val_ext(i,0); }; double val_ext(int i,int j) const override; protected : diff --git a/src/ThHyd/Dilatable/Common/Cond_Lim/Temperature_imposee_paroi_H.h b/src/ThHyd/Dilatable/Common/Cond_Lim/Temperature_imposee_paroi_H.h index 61d7576e6a..2b02ef9ff0 100644 --- a/src/ThHyd/Dilatable/Common/Cond_Lim/Temperature_imposee_paroi_H.h +++ b/src/ThHyd/Dilatable/Common/Cond_Lim/Temperature_imposee_paroi_H.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -30,6 +30,7 @@ class Temperature_imposee_paroi_H : public Temperature_imposee_paroi { Declare_instanciable(Temperature_imposee_paroi_H); public : + using Dirichlet::val_imp; void completer() override; double val_imp(int i) const override; double val_imp(int i, int j) const override; diff --git a/src/ThHyd/Dilatable/Common/Equations/Convection_Diffusion_Fluide_Dilatable_Proto.h b/src/ThHyd/Dilatable/Common/Equations/Convection_Diffusion_Fluide_Dilatable_Proto.h index 68727d7a71..627228f5f4 100644 --- a/src/ThHyd/Dilatable/Common/Equations/Convection_Diffusion_Fluide_Dilatable_Proto.h +++ b/src/ThHyd/Dilatable/Common/Equations/Convection_Diffusion_Fluide_Dilatable_Proto.h @@ -44,7 +44,7 @@ class Convection_Diffusion_Fluide_Dilatable_Proto virtual ~Convection_Diffusion_Fluide_Dilatable_Proto() {} protected: - public_for_cuda + protected_but_public_for_cuda void 
assembler_impl(Convection_Diffusion_Fluide_Dilatable_base& eqn, Matrice_Morse& mat_morse, const DoubleTab& present, DoubleTab& secmem); protected: diff --git a/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.cpp b/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.cpp index 6ca527a72b..c27951b073 100644 --- a/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.cpp +++ b/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.cpp @@ -456,7 +456,6 @@ void Navier_Stokes_Fluide_Dilatable_Proto::update_vpoint_on_boundaries(const Nav const DoubleTab& tab_rho_face_n = fluide_dil.rho_face_n(), &tab_rho_face_np1=fluide_dil.rho_face_np1(); const DoubleTab& tab_vit = eqn.vitesse().valeurs(); const Conds_lim& lescl = eqn.domaine_Cl_dis().les_conditions_limites(); - const IntTab& face_voisins = eqn.domaine_dis().face_voisins(); const int taille = tab_vpoint.line_size(); if (taille==1) @@ -470,30 +469,31 @@ void Navier_Stokes_Fluide_Dilatable_Proto::update_vpoint_on_boundaries(const Nav const Front_VF& la_front_dis = ref_cast(Front_VF,la_cl_base.frontiere_dis()); const Dirichlet& diri=ref_cast(Dirichlet,la_cl_base); const int ndeb = la_front_dis.num_premiere_face(), nfin = ndeb + la_front_dis.nb_faces(); - + CDoubleTabView val_imp = diri.tab_val_imp().view_ro(); + CDoubleArrView rho_face_np1 = static_cast(tab_rho_face_np1).view_ro(); + CDoubleArrView rho_face_n = static_cast(tab_rho_face_n).view_ro(); if (taille==1) // VDF // { - ToDo_Kokkos("critical"); - for (int num_face=ndeb; num_face(tab_vit).view_ro(); + DoubleArrView vpoint = static_cast(tab_vpoint).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + int n0 = face_voisins(num_face, 0); + if (n0 == -1) n0 = face_voisins(num_face, 1); + // GF en cas de diffsion implicite vpoint!=0 on ignrore l'ancienne valeur + 
vpoint(num_face)=(val_imp(num_face-ndeb,orientation_VDF(num_face))*rho_face_np1(num_face)- + vit(num_face)*rho_face_n(num_face))/dt_; + }); + end_gpu_timer(__KERNEL_NAME__); } else // VEF // { int dim = Objet_U::dimension; - CDoubleTabView val_imp = diri.tab_val_imp().view_ro(); - CDoubleArrView rho_face_np1 = static_cast(tab_rho_face_np1).view_ro(); - CDoubleArrView rho_face_n = static_cast(tab_rho_face_n).view_ro(); CDoubleTabView vit = tab_vit.view_ro(); DoubleTabView vpoint = tab_vpoint.view_wo(); - Kokkos::MDRangePolicy> policy({ndeb, 0}, {nfin, dim}); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), policy, KOKKOS_LAMBDA(const int num_face, const int jj) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({ndeb, 0}, {nfin, dim}), KOKKOS_LAMBDA(const int num_face, const int jj) { // GF en cas de diffusion implicite vpoint!=0 on ignrore l'ancienne valeur vpoint(num_face,jj)=(rho_face_np1(num_face)*val_imp(num_face-ndeb,jj) diff --git a/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.h b/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.h index 779295a780..d3a486898b 100644 --- a/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.h +++ b/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.h @@ -58,7 +58,7 @@ public : DoubleTab& secmem,DoubleTab& inc_pre,DoubleTab& vpoint ); void correct_and_compute_u_np1(Navier_Stokes_std& eqn,const Fluide_Dilatable_base& fluide_dil,DoubleTab& rhoU, DoubleTab& Mmoins1grad,DoubleTab& inc_pre,DoubleTab& gradP,DoubleTab& vpoint); - public_for_cuda + protected_but_public_for_cuda void prepare_and_solve_u_star(Navier_Stokes_std& eqn,const Fluide_Dilatable_base& fluide_dil,DoubleTab& rhoU,DoubleTab& vpoint); void update_vpoint_on_boundaries(const Navier_Stokes_std& eqn,const Fluide_Dilatable_base& fluide_dil,DoubleTab& vpoint); }; diff --git a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_GP_base.cpp 
b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_GP_base.cpp index 98cfdd280a..be8392f23c 100644 --- a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_GP_base.cpp +++ b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_GP_base.cpp @@ -153,21 +153,23 @@ void Loi_Etat_GP_base::calculer_alpha() if (champ_alpha.que_suis_je()=="Champ_Fonc_P0_VDF") isVDF = 1; int n=tab_alpha.size(); bool lambda_uniforme = sub_type(Champ_Uniforme,champ_lambda); + CDoubleTabView lambda = tab_lambda.view_ro(); + CDoubleTabView rho = tab_rho.view_ro(); + DoubleTabView alpha = tab_alpha.view_wo(); + double Cp = Cp_; // Cp_ attribute of class can't be used on device: so local copy if (isVDF) { - ToDo_Kokkos("critical"); - for (int i=0 ; ivitesse().domaine_dis_base()).elem_faces(); int nfe = tab_elem_faces.line_size(); - double Cp = Cp_; CIntTabView elem_faces = tab_elem_faces.view_ro(); - CDoubleTabView lambda = tab_lambda.view_ro(); - CDoubleTabView rho = tab_rho.view_ro(); - DoubleTabView alpha = tab_alpha.view_wo(); Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA( const int i) { diff --git a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_Mono_GP_base.h b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_Mono_GP_base.h index 0ee866c9f2..85351e1a76 100644 --- a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_Mono_GP_base.h +++ b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_Mono_GP_base.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -30,6 +30,8 @@ class Loi_Etat_Mono_GP_base : public Loi_Etat_GP_base { Declare_base(Loi_Etat_Mono_GP_base); +public: + KOKKOS_INLINE_FUNCTION static double calculer_rho(double P, double T, double r) { return P / (r * T); } protected : OWN_PTR(Champ_base) rho_constant_pour_debug_; }; diff --git a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_base.cpp b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_base.cpp index 865e0383b2..0cf83b3adf 100644 --- a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_base.cpp +++ b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_base.cpp @@ -172,8 +172,14 @@ void Loi_Etat_base::calculer_nu() if (viscosite_cinematique.que_suis_je()=="Champ_Fonc_P0_VDF") { // VDF - for (int i=0 ; i #include #include +#include +#include Implemente_base(Source_Masse_Fluide_Dilatable_base, "Source_Masse_Fluide_Dilatable_base", Objet_U); // XD mass_source interprete nul 1 Mass source used in a dilatable simulation to add/reduce a mass at the boundary (volumetric source in the first cell of a given boundary). 
@@ -114,7 +116,6 @@ void Source_Masse_Fluide_Dilatable_base::mettre_a_jour(double temps) double sum_conv = 0.; std::vector sum_diff_vect(ncomp_); - const DoubleTab& val_flux0 = ch_front_source_->valeurs(); const Domaine_Cl_dis_base& zclb = domaine_cl_dis_.valeur(); const Domaine_VF& zvf = ref_cast(Domaine_VF, zclb.domaine_dis()); @@ -130,14 +131,17 @@ void Source_Masse_Fluide_Dilatable_base::mettre_a_jour(double temps) const int is_uniforme = sub_type(Champ_front_uniforme, ch_front_source_.valeur()); const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); + CDoubleTabView val_flux0 = ch_front_source_->valeurs().view_ro(); + CDoubleArrView face_surfaces = zvf.face_surfaces().view_ro(); for (int i = 0; i < ncomp_; i++) { double sum_diff = 0.; - for (int f = ndeb; f < nfin; f++) - { - const double surf = zvf.face_surfaces(f); - sum_diff += is_uniforme ? val_flux0(0, i) * surf : val_flux0(f - ndeb, i) * surf; - } + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int f, double& local_sum) + { + const double surf = face_surfaces(f); + local_sum += is_uniforme ? val_flux0(0, i) * surf : val_flux0(f - ndeb, i) * surf; + }, sum_diff); + end_gpu_timer(__KERNEL_NAME__); sum_diff_vect[i] = Process::mp_sum(sum_diff); } @@ -167,9 +171,8 @@ void Source_Masse_Fluide_Dilatable_base::set_temps_defaut(double temps) ch_front_source_->set_temps_defaut(temps); } -void Source_Masse_Fluide_Dilatable_base::fill_val_flux_tab(DoubleTrav& val_flux) const +void Source_Masse_Fluide_Dilatable_base::fill_val_flux_tab(DoubleTrav& tab_val_flux) const { - const DoubleTab& val_flux0 = ch_front_source_->valeurs(); /* * XXX Elie Saikali mai 2025 : soucis avec ICoCo ... * Attention : val_flux a dimension de nb_faces or val_flux0 a dimension de nb_faces du bord nom_bord_ ... @@ -186,10 +189,15 @@ void Source_Masse_Fluide_Dilatable_base::fill_val_flux_tab(DoubleTrav& val_flux) // Handle uniform case ... 
such a pain: const int is_uniforme = sub_type(Champ_front_uniforme, ch_front_source_.valeur()); const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); - - for (int num_face = ndeb; num_face < nfin; num_face++) - for (int ncomp = 0; ncomp < val_flux0.line_size(); ncomp++) + const int ncomp_size = ch_front_source_->valeurs().line_size(); + CDoubleTabView val_flux0 = ch_front_source_->valeurs().view_ro(); + DoubleTabView val_flux = tab_val_flux.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + for (int ncomp = 0; ncomp < ncomp_size; ncomp++) val_flux(num_face, 0) += is_uniforme ? val_flux0(0, ncomp) : val_flux0(num_face - ndeb, ncomp); + }); + end_gpu_timer(__KERNEL_NAME__); } } } diff --git a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Fluide_Quasi_Compressible.h b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Fluide_Quasi_Compressible.h index 2ad449d486..5491df5a15 100644 --- a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Fluide_Quasi_Compressible.h +++ b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Fluide_Quasi_Compressible.h @@ -45,7 +45,7 @@ protected : mutable DoubleTab tab_W_old_; private : - public_for_cuda + protected_but_public_for_cuda void remplir_champ_pression_tot(int n, const DoubleTab& PHydro, DoubleTab& PTot) override; }; diff --git a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.cpp b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.cpp index 7a8b9c06e7..39e78813bc 100644 --- a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.cpp +++ b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.cpp @@ -54,7 +54,7 @@ void Loi_Etat_GP_QC::compute_tab_rho(DoubleTab& tab_rho) DoubleArrView rho = static_cast(tab_rho).view_wo(); Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), tab_rho.size(), KOKKOS_LAMBDA(const int som) { - rho_np1(som) = rho_constant ? 
rho_constant : Loi_Etat_Mono_GP_base::calculer_masse_volumique(Pth, tab_ICh(som), R); + rho_np1(som) = rho_constant ? rho_constant : calculer_rho(Pth, tab_ICh(som), R); rho(som) = 0.5 * (rho_n(som) + rho_np1(som)); }); end_gpu_timer(__KERNEL_NAME__); diff --git a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.h b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.h index ea1106bf34..2b2baaadc9 100644 --- a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.h +++ b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.h @@ -35,7 +35,7 @@ public : void calculer_masse_volumique() override; double calculer_masse_volumique(double,double) const override; protected: - public_for_cuda + protected_but_public_for_cuda void compute_tab_rho(DoubleTab&) override; }; diff --git a/src/ThHyd/Dilatable/Quasi_Compressible/Sources/Source_QC_Chaleur.h b/src/ThHyd/Dilatable/Quasi_Compressible/Sources/Source_QC_Chaleur.h index 744f722365..724b540b69 100644 --- a/src/ThHyd/Dilatable/Quasi_Compressible/Sources/Source_QC_Chaleur.h +++ b/src/ThHyd/Dilatable/Quasi_Compressible/Sources/Source_QC_Chaleur.h @@ -37,7 +37,7 @@ class Source_QC_Chaleur : public Source_Chaleur_Fluide_Dilatable_base public: DoubleTab& ajouter(DoubleTab& ) const override; protected: - public_for_cuda + protected_but_public_for_cuda virtual DoubleTab& ajouter_(DoubleTab& ) const; }; diff --git a/src/ThHyd/Incompressible/Cond_Lim/Entree_fluide_T_h_imposee.h b/src/ThHyd/Incompressible/Cond_Lim/Entree_fluide_T_h_imposee.h index 907c7971e4..af607bd9cc 100644 --- a/src/ThHyd/Incompressible/Cond_Lim/Entree_fluide_T_h_imposee.h +++ b/src/ThHyd/Incompressible/Cond_Lim/Entree_fluide_T_h_imposee.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -29,6 +29,7 @@ class Entree_fluide_T_h_imposee: public Dirichlet_entree_fluide { Declare_instanciable(Entree_fluide_T_h_imposee); public: + using Dirichlet::val_imp; double val_imp(int i) const override; double val_imp(int i, int j) const override; inline void bascule_cond_lim_en_enthalpie() { type_cond_lim = 1; } diff --git a/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.cpp b/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.cpp index dd2c116bbd..a73d21f09d 100644 --- a/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.cpp +++ b/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2026, CEA +* Copyright (c) 2025, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -160,7 +160,7 @@ double Neumann_sortie_libre::val_ext(int i, int j) const return le_champ_ext->valeurs()(i, j); } -const DoubleTab& Neumann_sortie_libre::val_ext() const +const DoubleTab& Neumann_sortie_libre::tab_val_ext() const { const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis()); int nb_faces_tot = le_bord.nb_faces_tot(); diff --git a/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.h b/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.h index 5fe1c8b82a..56a3e7c0b2 100644 --- a/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.h +++ b/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -39,7 +39,7 @@ class Neumann_sortie_libre: public Neumann_val_ext double val_ext(int i) const override; double val_ext(int i, int j) const override; - const DoubleTab& val_ext() const; + const DoubleTab& tab_val_ext() const override; int initialiser(double temps) override; void associer_fr_dis_base(const Frontiere_dis_base&) override; void verifie_ch_init_nb_comp() const override; diff --git a/src/ThHyd/Incompressible/Equations/Navier_Stokes_std.cpp b/src/ThHyd/Incompressible/Equations/Navier_Stokes_std.cpp index 32437ef8c3..0881834899 100644 --- a/src/ThHyd/Incompressible/Equations/Navier_Stokes_std.cpp +++ b/src/ThHyd/Incompressible/Equations/Navier_Stokes_std.cpp @@ -1067,7 +1067,6 @@ void Navier_Stokes_std::mettre_a_jour(double temps) Debog::verifier("Navier_Stokes_std::mettre_a_jour : pression", la_pression->valeurs()); Debog::verifier("Navier_Stokes_std::mettre_a_jour : vitesse", la_vitesse->valeurs()); - if (la_vorticite) la_vorticite->mettre_a_jour(temps); if (critere_Q) critere_Q->mettre_a_jour(temps); if (Reynolds_maille) Reynolds_maille->mettre_a_jour(temps); if (Taux_cisaillement) Taux_cisaillement->mettre_a_jour(temps); @@ -1505,7 +1504,7 @@ const Champ_base& Navier_Stokes_std::get_champ(const Motcle& nom) const throw std::runtime_error(std::string("Field ") + nom.getString() + std::string(" not found !")); Champ_Fonc_base& ch = ref_cast_non_const(Champ_Fonc_base, la_vorticite.valeur()); - if ((ch.temps() == temps_init) && (la_vitesse->mon_equation_non_nul())) + if (((ch.temps() != la_vitesse->temps()) || (ch.temps() == temps_init)) && (la_vitesse->mon_equation_non_nul())) ch.mettre_a_jour(la_vitesse->temps()); return champs_compris_.get_champ(nom); } diff --git a/src/ThHyd/Incompressible/Problems/Generique/Pb_Fluide_base.cpp b/src/ThHyd/Incompressible/Problems/Generique/Pb_Fluide_base.cpp index 
d9bdfdbfa6..734e174778 100644 --- a/src/ThHyd/Incompressible/Problems/Generique/Pb_Fluide_base.cpp +++ b/src/ThHyd/Incompressible/Problems/Generique/Pb_Fluide_base.cpp @@ -103,6 +103,13 @@ int Pb_Fluide_base::expression_predefini(const Motcle& motlu, Nom& expression) expression += " energie_cinetique_elem } "; return 1; } + else if (motlu=="ENSTROPHIE_TOTALE") + { + expression = "predefini { pb_champ "; + expression += le_nom(); + expression += " enstrophie_totale } "; + return 1; + } else if (motlu=="VISCOUS_FORCE_X") { expression = "predefini { pb_champ "; diff --git a/src/ThHyd/Incompressible/Sources/Terme_Boussinesq_base.h b/src/ThHyd/Incompressible/Sources/Terme_Boussinesq_base.h index 6f1c25bc89..e8f31967b6 100644 --- a/src/ThHyd/Incompressible/Sources/Terme_Boussinesq_base.h +++ b/src/ThHyd/Incompressible/Sources/Terme_Boussinesq_base.h @@ -125,6 +125,12 @@ inline double valeur(const DoubleTab& valeurs, const int elem, const int dim) return valeurs(elem,dim); } +KOKKOS_INLINE_FUNCTION +double valeur(CDoubleTabView valeurs, const int elem, const int dim) +{ + return valeurs.rank()==1 ? 
valeurs(elem,0) : valeurs(elem,dim); +} + // Methode de calcul de la valeur sur une face encadree par elem1 et elem2 d'un champ uniforme ou non a plusieurs composantes inline double valeur(const DoubleTab& valeurs_champ, int elem1, int elem2, const int compo) { @@ -139,6 +145,22 @@ inline double valeur(const DoubleTab& valeurs_champ, int elem1, int elem2, const return 0.5*(valeurs_champ(elem1,compo)+valeurs_champ(elem2,compo)); } } + +KOKKOS_INLINE_FUNCTION +double valeur(CDoubleTabView valeurs_champ, int elem1, int elem2, const int compo) +{ + if (valeurs_champ.extent(0)==1) + return valeurs_champ(0,compo); // Champ uniforme + else + { + if (elem2<0) elem2 = elem1; // face frontiere + if (valeurs_champ.rank()==1) + return 0.5*(valeurs_champ(elem1,0)+valeurs_champ(elem2,0)); + else + return 0.5*(valeurs_champ(elem1,compo)+valeurs_champ(elem2,compo)); + } +} + KOKKOS_INLINE_FUNCTION double valeur(CDoubleTabView valeurs_champ, int valeurs_champ_dimension0, int nb_dim, int elem1, int elem2, const int compo, int nb_compo) { diff --git a/src/ThHyd/Incompressible/Traitement_particulier/Traitement_particulier_NS_EC.cpp b/src/ThHyd/Incompressible/Traitement_particulier/Traitement_particulier_NS_EC.cpp index 9265040d88..eea80c2f80 100644 --- a/src/ThHyd/Incompressible/Traitement_particulier/Traitement_particulier_NS_EC.cpp +++ b/src/ThHyd/Incompressible/Traitement_particulier/Traitement_particulier_NS_EC.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -172,73 +172,80 @@ void Traitement_particulier_NS_EC::post_traitement_particulier() static double trait_part_calculer_ec_faces(const int face_debut, const int nb_faces, const int frontiere, - const DoubleTab& vitesse, - const DoubleVect& volumes_entrelaces, - const DoubleTab& xv, - const DoubleTab& masse_volumique, - const ArrOfDouble& translation, - const ArrOfDouble& rotation, + const DoubleTab& tab_vitesse, + const DoubleVect& tab_volumes_entrelaces, + const DoubleTab& tab_xv, + const DoubleTab& tab_masse_volumique, + const ArrOfDouble& tab_translation, + const ArrOfDouble& tab_rotation, const int repere_mobile_, - const ArrOfInt& faces_doubles + const ArrOfInt& tab_faces_doubles ) { const int face_fin = face_debut + nb_faces; double ec = 0.; - double rho = 0.; - const int nb_dim_1 = (vitesse.line_size() == 1); + const int nb_dim_1 = (tab_vitesse.line_size() == 1); const int dim = Objet_U::dimension; - ArrOfDouble ve(Objet_U::dimension); - for (int face = face_debut; face < face_fin; face++) + if (nb_dim_1 && repere_mobile_) { - // Calcul de la vitesse d'entrainement - if (repere_mobile_) - { - ve[0]=translation[0]; - ve[1]=translation[1]; - if (Objet_U::dimension==3) - { - ve[2]=translation[2]; - ve[0]+=rotation[1]*xv(face,2)-rotation[2]*xv(face,1); - ve[1]+=rotation[2]*xv(face,0)-rotation[0]*xv(face,2); - ve[2]+=rotation[0]*xv(face,1)-rotation[1]*xv(face,0); - } - } - else - ve=0; - - double v2; - double volume; - if (nb_dim_1) - { - // Une composante de vitesse a la face (VDF) - const double v = vitesse(face); - if (repere_mobile_) - { - Cerr << "Le codage de l'energie cinetique calculee dans un repere fixe" <(face_debut, face_fin), + KOKKOS_LAMBDA(const int face, double& ec_) + { + // Calcul de la vitesse d'entrainement + double ve0 = 0., ve1 = 0., ve2 = 0.; + if (repere_mobile_) + { + ve0 = translation(0); + 
ve1 = translation(1); + if (dim == 3) + { + ve2 = translation(2); + ve0 += rotation(1)*xv(face,2) - rotation(2)*xv(face,1); + ve1 += rotation(2)*xv(face,0) - rotation(0)*xv(face,2); + ve2 += rotation(0)*xv(face,1) - rotation(1)*xv(face,0); + } + } + double v2, volume; + if (nb_dim_1) + { + // Une composante de vitesse a la face (VDF) + const double v = vitesse(face, 0); + v2 = v * v; + // En VDF, sur les frontieres, on ne prend que le 1/2 volume entrelace + volume = volume_factor * volumes_entrelaces(face); + } + else + { + // Deux ou trois composantes (VEFP1B) + v2 = 0.; + const double ve[3] = {ve0, ve1, ve2}; + for (int i = 0; i < dim; i++) + { + const double v_i = vitesse(face, i); + v2 += (v_i + ve[i]) * (v_i + ve[i]); + } + // En VEF, cela est incorrect, il faudrait les volumes etendus: + volume = volumes_entrelaces(face); + } + const int k = masse_vol_uniform ? 0 : face; + const double rho = masse_volumique(k, 0); + const double contribution = (faces_doubles(face) == 1) ? 0.5 : 1.; + ec_ += contribution * 0.5 * v2 * volume * rho; + }, Kokkos::Sum(ec)); + end_gpu_timer(__KERNEL_NAME__); return ec; } diff --git a/src/ThHyd/Multiphase/Schemas_Temps/SETS.cpp b/src/ThHyd/Multiphase/Schemas_Temps/SETS.cpp index d5d8766737..e92650fa18 100644 --- a/src/ThHyd/Multiphase/Schemas_Temps/SETS.cpp +++ b/src/ThHyd/Multiphase/Schemas_Temps/SETS.cpp @@ -212,7 +212,7 @@ void SETS::init_cv_ctx(const DoubleTab& secmem, const DoubleVect& norme) KSPConvergedDefaultCreate(&cv_ctx->defctx); } -#if PETSC_VERSION_GE(3,24,0) +#if PETSC_VERSION_GE(3,24,0) && PETSC_VERSION_LT(3,25,0) PetscErrorCode SETS::destroy_cvctx(void **mctx) { SETS::cv_test_t *ctx = (SETS::cv_test_t *)*mctx; @@ -224,6 +224,18 @@ PetscErrorCode SETS::destroy_cvctx(void **mctx) free(ctx); return err; } +#elif PETSC_VERSION_GE(3,25,0) +PetscErrorCode SETS::destroy_cvctx(void *mctx) +{ + SETS::cv_test_t *ctx = *(SETS::cv_test_t **)mctx; + if (ctx->v) + VecDestroy(&ctx->v); + if (ctx->t) + VecDestroy(&ctx->t); + 
PetscErrorCode err = KSPConvergedDefaultDestroy((void *)&ctx->defctx); + free(ctx); + return err; +} #else PetscErrorCode SETS::destroy_cvctx(void *mctx) { diff --git a/src/ThHyd/Multiphase/Schemas_Temps/SETS.h b/src/ThHyd/Multiphase/Schemas_Temps/SETS.h index 3887bba782..d70326b596 100644 --- a/src/ThHyd/Multiphase/Schemas_Temps/SETS.h +++ b/src/ThHyd/Multiphase/Schemas_Temps/SETS.h @@ -94,7 +94,7 @@ class SETS: public Simpler ArrOfTID ix; //indices pour recuperer le residu cv_test_t *cv_ctx = nullptr; void init_cv_ctx(const DoubleTab& secmem, const DoubleVect& norm); -#if PETSC_VERSION_GE(3,24,0) +#if PETSC_VERSION_GE(3,24,0) && PETSC_VERSION_LT(3,25,0) static PetscErrorCode destroy_cvctx(void **mctx); #else static PetscErrorCode destroy_cvctx(void *mctx); diff --git a/src/ThHyd/Rayonnement/Semi_transparent/Equations/Eq_rayo_semi_transp.cpp b/src/ThHyd/Rayonnement/Semi_transparent/Equations/Eq_rayo_semi_transp.cpp index e51857f808..d78c1115ff 100644 --- a/src/ThHyd/Rayonnement/Semi_transparent/Equations/Eq_rayo_semi_transp.cpp +++ b/src/ThHyd/Rayonnement/Semi_transparent/Equations/Eq_rayo_semi_transp.cpp @@ -299,18 +299,17 @@ void Eq_rayo_semi_transp::Mat_Morse_to_Mat_Bloc(Matrice& matrice_tmp) DoubleTab ligne_tmp(n1); for (int i = 0; i < n2; i++) { - int k; // On recopie le premier bloc de la matrice dans un tableau : // ligne_tmp = 0; - for (k = la_matrice_.get_tab1()(i) - 1; k < la_matrice_.get_tab1()(i + 1) - 1; k++) + for (auto k = la_matrice_.get_tab1()(i) - 1; k < la_matrice_.get_tab1()(i + 1) - 1; k++) ligne_tmp(la_matrice_.get_tab2()(k) - 1) = la_matrice_.get_coeff()(k); // On complete la partie reelle de la matrice - for (k = tab1RR(i) - 1; k < tab1RR(i + 1) - 1; k++) + for (auto k = tab1RR(i) - 1; k < tab1RR(i + 1) - 1; k++) coeffRR[k] = ligne_tmp(tab2RR[k] - 1); // On complete la partie virtuelle - for (k = tab1RV(i) - 1; k < tab1RV(i + 1) - 1; k++) + for (auto k = tab1RV(i) - 1; k < tab1RV(i + 1) - 1; k++) coeffRV[k] = ligne_tmp(n2 + tab2RV[k] - 1); 
} } @@ -347,13 +346,11 @@ void Eq_rayo_semi_transp::dimensionner_Mat_Bloc_Morse_Sym(Matrice& matrice_tmp) // On parcours les lignes de la_matrice pour compter les elements // non nuls de chaque ligne - int jcolonne; for (iligne = 0; iligne < n2; iligne++) { - int k; - for (k = tab1(iligne) - 1; k < tab1(iligne + 1) - 1; k++) + for (auto k = tab1(iligne) - 1; k < tab1(iligne + 1) - 1; k++) { - jcolonne = tab2(k) - 1; + auto jcolonne = tab2(k) - 1; if (jcolonne < n2) { // l'element correspondant est dans la partie RR de la_matrice @@ -384,15 +381,13 @@ void Eq_rayo_semi_transp::dimensionner_Mat_Bloc_Morse_Sym(Matrice& matrice_tmp) MBrv.dimensionner(n2, n1 - n2, tab1RV(n2) - 1); // On remplit tab2RR et tab2RV - int compteurRR, compteurRV; for (iligne = 0; iligne < n2; iligne++) { - int k; - compteurRR = tab1RR(iligne) - 1; - compteurRV = tab1RV(iligne) - 1; - for (k = tab1(iligne) - 1; k < tab1(iligne + 1) - 1; k++) + auto compteurRR = tab1RR(iligne) - 1; + auto compteurRV = tab1RV(iligne) - 1; + for (auto k = tab1(iligne) - 1; k < tab1(iligne + 1) - 1; k++) { - jcolonne = tab2(k) - 1; + auto jcolonne = tab2(k) - 1; if (jcolonne < n2) { // l'element correspondant est dans la partie RR de la_matrice diff --git a/src/ThHyd/Schemas_Temps/Simple.cpp b/src/ThHyd/Schemas_Temps/Simple.cpp index abb64bc2be..3bae04dc04 100644 --- a/src/ThHyd/Schemas_Temps/Simple.cpp +++ b/src/ThHyd/Schemas_Temps/Simple.cpp @@ -450,7 +450,16 @@ bool Simple::iterer_eqs(LIST(OBS_PTR(Equation_base)) eqs, int nb_iter, int& ok) DoubleTab_parts residu_parts(residus), inconnues_parts(inconnues), dudt_parts(dudt); //remplissage des inconnues - for(i = 0; i < eqs.size(); i++) inconnues_parts[i] = eqs[i]->inconnue().valeurs(); + // DoubleTab_parts share data_location_: allocating device memory for parts[0] marks all parts as + // Device, but parts[1..] have no exact entry in DeviceMemory, causing inconsistency. Ensure all + // unknowns are on host so no device allocation is triggered in copy_. 
+ for(i = 0; i < eqs.size(); i++) + { + ToDo_Kokkos("Fix this D2H copy."); + eqs[i]->inconnue().valeurs().ensureDataOnHost(); + } + for(i = 0; i < eqs.size(); i++) + inconnues_parts[i] = eqs[i]->inconnue().valeurs(); dudt = inconnues; //remplissage des matrices diff --git a/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_base.cpp b/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_base.cpp index 3bde1b6935..3c59a2ac5a 100644 --- a/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_base.cpp +++ b/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_base.cpp @@ -124,24 +124,29 @@ const Champ_base& Turbulence_paroi_base::get_champ(const Motcle& nom) const if (champ_u_star_ && nom == champ_u_star_->le_nom()) { // Initialisation a 0 du champ volumique u_star - DoubleTab& valeurs = champ_u_star_->valeurs(); - valeurs = 0; + DoubleTab& tab_valeurs = champ_u_star_->valeurs(); + tab_valeurs = 0; const Equation_base& my_eqn = mon_modele_turb_hyd->equation(); if (tab_u_star_.size_array() > 0) { // Boucle sur les frontieres pour recuperer u_star si tab_u_star dimensionne int nb_front = my_eqn.domaine_dis().nb_front_Cl(); + CDoubleArrView u_star = tab_u_star_.view_ro(); + DoubleArrView valeurs = static_cast(tab_valeurs).view_rw(); for (int n_bord = 0; n_bord < nb_front; n_bord++) { const Cond_lim& la_cl = my_eqn.domaine_Cl_dis().les_conditions_limites(n_bord); const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); - int ndeb = le_bord.num_premiere_face(); - int nfin = ndeb + le_bord.nb_faces(); - for (int num_face = ndeb; num_face < nfin; num_face++) - valeurs(num_face) = tab_u_star_(num_face); + const int ndeb = le_bord.num_premiere_face(); + const int nfin = ndeb + le_bord.nb_faces(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + valeurs(num_face) = u_star(num_face); + }); + end_gpu_timer(__KERNEL_NAME__); } } - valeurs.echange_espace_virtuel(); + tab_valeurs.echange_espace_virtuel(); // Met a 
jour le temps du champ: champ_u_star_->mettre_a_jour(my_eqn.schema_temps().temps_courant()); return champs_compris_.get_champ(nom); diff --git a/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_scal_base.h b/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_scal_base.h index 217915444d..d9d9bd1490 100644 --- a/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_scal_base.h +++ b/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_scal_base.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -106,7 +106,7 @@ class Turbulence_paroi_scal_base: public Champs_compris_interface, public Objet_ int calcul_ldp_en_flux_impose_; // flag defenissant si on utilise la ldp en flux impose 0 par defaut double Prdt_sur_kappa_; // Constante dans la loi de paroi KOKKOS_INLINE_FUNCTION - double T_plus(double y_plus, double Pr, double Prdt_sur_kappa); + static double T_plus(double y_plus, double Pr, double Prdt_sur_kappa); DoubleVects equivalent_distance_; // tableau des distances equivalentes sur chaque bord diff --git a/src/ThHyd/Turbulence/Modeles/scal/Modele_turbulence_scal_Prandtl.h b/src/ThHyd/Turbulence/Modeles/scal/Modele_turbulence_scal_Prandtl.h index 3f3086bfaf..668d2de307 100644 --- a/src/ThHyd/Turbulence/Modeles/scal/Modele_turbulence_scal_Prandtl.h +++ b/src/ThHyd/Turbulence/Modeles/scal/Modele_turbulence_scal_Prandtl.h @@ -39,7 +39,7 @@ class Modele_turbulence_scal_Prandtl: public Modele_turbulence_scal_diffturb_bas Nom definition_fonction_; // stockage de la chaine du jdd Parser_U fonction_; // fonction de calcul de alpha_t Parser_U fonction1_; // fonction de calcul de Prandtl variant en espace - public_for_cuda + protected_but_public_for_cuda virtual Champ_Fonc_base& calculer_diffusivite_turbulente(); }; diff 
--git a/src/VDF/Champs/Champ_Face_VDF.cpp b/src/VDF/Champs/Champ_Face_VDF.cpp index f1c6d26b6f..b9f802d8ae 100644 --- a/src/VDF/Champs/Champ_Face_VDF.cpp +++ b/src/VDF/Champs/Champ_Face_VDF.cpp @@ -72,6 +72,7 @@ Champ_base& Champ_Face_VDF::affecter_(const Champ_base& ch) int ndeb_int = domaine_VDF.premiere_face_int(); const IntTab& f_e = domaine_VDF.face_voisins(); + ToDo_Kokkos("critical"); for (int f = 0; f < ndeb_int; f++) { const int ori = orientation(f); @@ -80,6 +81,7 @@ Champ_base& Champ_Face_VDF::affecter_(const Champ_base& ch) val(f, n) = v(e, N * ori + n); } + ToDo_Kokkos("critical"); for (int f = ndeb_int; f < domaine_VDF.nb_faces(); f++) { const int ori = orientation(f); @@ -94,6 +96,7 @@ Champ_base& Champ_Face_VDF::affecter_(const Champ_base& ch) if (unif) eval = ch.valeurs(); else eval.resize(val.dimension(0), N * D), ch.valeur_aux(domaine_VDF.xv(), eval); + ToDo_Kokkos("critical"); for (int f = 0; f < domaine_VDF.nb_faces(); f++) for (int n = 0; n < N; n++) val(f, n) = eval(unif ? 
0 : f, N * orientation(f) + n); @@ -154,6 +157,7 @@ const Champ_Proto& Champ_Face_VDF::affecter(const DoubleTab& v) { if (v.dimension(1) == dimension) { + ToDo_Kokkos("critical"); if (v.dimension(0) == val.size()) for (int num_face = 0; num_face < val.size(); num_face++) val(num_face) = v(num_face, orientation(num_face)); @@ -179,8 +183,7 @@ void Champ_Face_VDF::verifie_valeurs_cl() { const Domaine_Cl_dis_base& zcl = domaine_Cl_dis(); int nb_cl = zcl.nb_cond_lim(); - DoubleTab& ch_tab = valeurs(); - int ndeb, nfin, num_face; + DoubleTab& tab_ch = valeurs(); for (int i = 0; i < nb_cl; i++) { @@ -189,24 +192,24 @@ void Champ_Face_VDF::verifie_valeurs_cl() { const Periodique& la_cl_perio = ref_cast(Periodique, la_cl); const Front_VF& le_bord = ref_cast(Front_VF, la_cl.frontiere_dis()); - ndeb = le_bord.num_premiere_face(); - nfin = ndeb + le_bord.nb_faces(); - int voisine; - double moy; - - for (num_face = ndeb; num_face < nfin; num_face++) - { - voisine = la_cl_perio.face_associee(num_face - ndeb) + ndeb; - if (ch_tab[num_face] != ch_tab[voisine]) - { - moy = 0.5 * (ch_tab[num_face] + ch_tab[voisine]); - ch_tab[num_face] = moy; - ch_tab[voisine] = moy; - } - } + const int ndeb = le_bord.num_premiere_face(); + const int nfin = ndeb + le_bord.nb_faces(); + CIntArrView face_associee = la_cl_perio.face_associee().view_ro(); + DoubleArrView ch_tab = static_cast(tab_ch).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + const int voisine = face_associee(num_face - ndeb) + ndeb; + if (ch_tab(num_face) != ch_tab(voisine)) + { + const double moy = 0.5 * (ch_tab(num_face) + ch_tab(voisine)); + ch_tab(num_face) = moy; + ch_tab(voisine) = moy; + } + }); + end_gpu_timer(__KERNEL_NAME__); } } - ch_tab.echange_espace_virtuel(); + tab_ch.echange_espace_virtuel(); } /*! @brief Renvoie la valeur que devrait avoir le champ sur une face de bord, si on en croit les conditions aux limites. 
@@ -220,7 +223,7 @@ void Champ_Face_VDF::verifie_valeurs_cl() double Champ_Face_VDF::val_imp_face_bord_private(int face, int comp) const { const Domaine_Cl_VDF& zclo = ref_cast(Domaine_Cl_VDF, equation().domaine_Cl_dis()); - return Champ_Face_get_val_imp_face_bord_sym(valeurs(), temps(), face, comp, zclo); + return Champ_Face_get_val_imp_face_bord(temps(), face, comp, zclo, &valeurs()); } // WEC : jamais appele !! @@ -260,14 +263,10 @@ void Champ_Face_VDF::calculer_rotationnel_ordre2_centre_element(DoubleTab& rot) { const DoubleTab& val = valeurs(); const Domaine_VDF& domaine_VDF = domaine_vdf(); - int nb_elem = domaine_VDF.nb_elem(); - const IntTab& face_voisins = domaine_VDF.face_voisins(); - const IntTab& elem_faces = domaine_VDF.elem_faces(); - if (dimension == 2) - calrotord2centelemdim2(rot, val, domaine_VDF, nb_elem, face_voisins, elem_faces); + calrotord2centelemdim2(rot, val, domaine_VDF); else if (dimension == 3) - calrotord2centelemdim3(rot, val, domaine_VDF, nb_elem, face_voisins, elem_faces); + calrotord2centelemdim3(rot, val, domaine_VDF); } int Champ_Face_VDF::imprime(Sortie& os, int ncomp) const @@ -276,7 +275,7 @@ int Champ_Face_VDF::imprime(Sortie& os, int ncomp) const return 1; } -void Champ_Face_VDF::calcul_critere_Q(DoubleTab& Q, const Domaine_Cl_VDF& domaine_Cl_VDF) +void Champ_Face_VDF::calcul_critere_Q(DoubleTab& tab_Q, const Domaine_Cl_VDF& domaine_Cl_VDF) { // Q=0.5*(\Omega_{ij}*\Omega_{ij}-S_{ij}*S_{ij})=-0.25*du_i/dx_j*du_j/dx_i @@ -285,55 +284,45 @@ void Champ_Face_VDF::calcul_critere_Q(DoubleTab& Q, const Domaine_Cl_VDF& domain const DoubleTab& vitesse = valeurs(); const int nb_elem = domaine_VDF.nb_elem(); const int nb_elem_tot = domaine_VDF.nb_elem_tot(); - int num_elem, i, j, N = vitesse.line_size(); - double crit, deriv1, deriv2; - + const int N = vitesse.line_size(); + const int dim = Objet_U::dimension; if (N!=1) Process::exit(que_suis_je() + "::calcul_critere_Q : the velocity field must be single phase !!"); - DoubleTab 
gradient_elem(nb_elem_tot, dimension, dimension, N); - gradient_elem = 0.; - - vit.calcul_duidxj(vitesse, gradient_elem, domaine_Cl_VDF); - - for (num_elem = 0; num_elem < nb_elem; num_elem++) - { - crit = 0.; - for (i = 0; i < dimension; i++) - for (j = 0; j < dimension; j++) - { - deriv1 = gradient_elem(num_elem, i, j, 0); - deriv2 = gradient_elem(num_elem, j, i, 0); - - crit += -0.25 * deriv1 * deriv2; - } - Q[num_elem] = crit; - } + DoubleTrav tab_gradient_elem(nb_elem_tot, dim, dim, N); + vit.calcul_duidxj(vitesse, tab_gradient_elem, domaine_Cl_VDF); + CDoubleTabView4 gradient_elem = tab_gradient_elem.view_ro<4>(); + DoubleArrView Q = static_cast(tab_Q).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int num_elem) + { + double crit = 0.; + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + crit += -0.25 * gradient_elem(num_elem, i, j, 0) * gradient_elem(num_elem, j, i, 0); + Q(num_elem) = crit; + }); + end_gpu_timer(__KERNEL_NAME__); } -void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& domaine_Cl_VDF) +void Champ_Face_VDF::calcul_y_plus(DoubleTab& tab_y_plus, const Domaine_Cl_VDF& domaine_Cl_VDF) { // On initialise le champ y_plus avec une valeur negative, // comme ca lorsqu'on veut visualiser le champ pres de la paroi, // on n'a qu'a supprimer les valeurs negatives et n'apparaissent // que les valeurs aux parois. 
- int ndeb, nfin, elem, ori, l_unif; - double norm_tau, u_etoile, norm_v = 0, dist, val0, val1, val2, d_visco = 0, visco = 1.; - y_plus = -1.; + int ndeb, nfin, l_unif; + double visco = 1.; const Champ_Face_VDF& vit = *this; const Domaine_VDF& domaine_VDF = domaine_vdf(); - const IntTab& face_voisins = domaine_VDF.face_voisins(); - const IntVect& orientation = domaine_VDF.orientation(); const Equation_base& eqn_hydr = equation(); const Fluide_base& le_fluide = ref_cast(Fluide_base, eqn_hydr.milieu()); const Champ_Don_base& ch_visco_cin = le_fluide.viscosite_cinematique(); - const DoubleTab& tab_visco = ch_visco_cin.valeurs(); - //DoubleTab& tab_visco = ch_visco_cin.valeurs(); + const DoubleTab& tab_visco_cin = ch_visco_cin.valeurs(); if (sub_type(Champ_Uniforme, ch_visco_cin)) { - visco = tab_visco(0, 0); + visco = tab_visco_cin(0, 0); l_unif = 1; } else @@ -343,15 +332,15 @@ void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& doma /* GF on a pas a change tab_visco ici ! 
if (!l_unif) { - const int n = tab_visco.size_array(); - ArrOfDouble& v = tab_visco; + const int n = tab_visco_cin.size_array(); + ArrOfDouble& v = tab_visco_cin; for (int i = 0; i < n; i++) if (v[i] < DMINFLOAT) v[i] = DMINFLOAT; } */ - DoubleTab yplus_faces(1, 1); // will contain yplus values if available + DoubleTrav tab_yplus_faces(1, 1); // will contain yplus values if available int yplus_already_computed = 0; // flag const RefObjU& modele_turbulence = eqn_hydr.get_modele(TURBULENCE); @@ -361,12 +350,28 @@ void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& doma const Turbulence_paroi_base& loipar = mod_turb.loi_paroi(); if (loipar.use_shear()) { - yplus_faces.resize(domaine_vdf().nb_faces_tot()); - yplus_faces.ref(loipar.tab_d_plus()); + tab_yplus_faces.resize(domaine_vdf().nb_faces_tot()); + tab_yplus_faces.ref(loipar.tab_d_plus()); yplus_already_computed = 1; } } - + const int dim = Objet_U::dimension; + const int is_axi = Objet_U::axi; + + Domaine_VDF_View dom_vdf(domaine_VDF); + CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro(); + CIntArrView orientation = domaine_VDF.orientation().view_ro(); + CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro(); + CDoubleArrView vitesse = static_cast(vit.valeurs()).view_ro(); + CDoubleArrView yplus_faces; + if (yplus_already_computed) yplus_faces = static_cast(tab_yplus_faces).view_ro(); + CDoubleTabView tab_visco; + if (!l_unif) tab_visco = tab_visco_cin.view_ro(); + DoubleTrav tab_counter(tab_y_plus.size_array()); + tab_counter = 0; + tab_y_plus = 0; + DoubleArrView y_plus = static_cast(tab_y_plus).view_rw(); + DoubleArrView counter = static_cast(tab_counter).view_wo(); for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++) { const Cond_lim& la_cl = domaine_Cl_VDF.les_conditions_limites(n_bord); @@ -376,52 +381,47 @@ void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& doma const Front_VF& le_bord = ref_cast(Front_VF, 
la_cl->frontiere_dis()); ndeb = le_bord.num_premiere_face(); nfin = ndeb + le_bord.nb_faces(); - - for (int num_face = ndeb; num_face < nfin; num_face++) - { - - if (face_voisins(num_face, 0) != -1) - elem = face_voisins(num_face, 0); - else - elem = face_voisins(num_face, 1); - - if (yplus_already_computed) - { - // y+ is only defined on faces so we take the face value to put in the element - y_plus(elem) = yplus_faces(num_face); - } - else - { - if (dimension == 2) - { - ori = orientation(num_face); - norm_v = norm_2D_vit(vit.valeurs(), elem, ori, domaine_VDF, val0); - } - else if (dimension == 3) - { - ori = orientation(num_face); - norm_v = norm_3D_vit(vit.valeurs(), elem, ori, domaine_VDF, val1, val2); - } // dim 3 - - if (axi) - dist = domaine_VDF.dist_norm_bord_axi(num_face); - else - dist = domaine_VDF.dist_norm_bord(num_face); - if (l_unif) - d_visco = visco; - else - d_visco = tab_visco[elem]; - - // PQ : 01/10/03 : corrections par rapport a la version premiere - norm_tau = d_visco * norm_v / dist; - - u_etoile = sqrt(norm_tau); - y_plus(elem) = dist * u_etoile / d_visco; - - } // else yplus already computed - } // loop on faces + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + const int elem = face_voisins(num_face, 0) != -1 ? face_voisins(num_face, 0) : face_voisins(num_face, 1); + if (yplus_already_computed) + { + // y+ is only defined on faces so we take the face value to put in the element + Kokkos::atomic_add(&y_plus(elem), yplus_faces(num_face)); + } + else + { + const int ori = orientation(num_face); + double norm_v = 0; + if (dim == 2) + { + double val0; + norm_v = norm_2D_vit(vitesse, elem, ori, elem_faces, val0); + } + else + { + double val1, val2; + norm_v = norm_3D_vit(vitesse, elem, ori, elem_faces, val1, val2); + } + const double dist = is_axi ? dom_vdf.dist_norm_bord_axi(num_face) : dom_vdf.dist_norm_bord(num_face); + const double d_visco = l_unif ? 
visco : tab_visco(elem, 0); + const double norm_tau = d_visco * norm_v / dist; + Kokkos::atomic_add(&y_plus(elem), dist * Kokkos::sqrt(norm_tau) / d_visco); + } + Kokkos::atomic_add(&counter(elem), 1.); + }); // loop on faces + end_gpu_timer(__KERNEL_NAME__); } // Fin paroi fixe } // Fin boucle sur les bords + + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, tab_y_plus.size_array()), KOKKOS_LAMBDA(const int elem) + { + if (counter(elem) > 0.) + y_plus(elem) /= counter(elem); + else + y_plus(elem) = -1.; + }); + end_gpu_timer(__KERNEL_NAME__); } /*! @brief Methode qui renvoie gij aux elements a partir de la vitesse aux faces (gij represente la derivee partielle dui/dxj) @@ -429,298 +429,314 @@ void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& doma * A partir de gij, on peut calculer Sij = 0.5(gij(i,j)+gij(j,i)) * */ -DoubleTab& Champ_Face_VDF::calcul_duidxj(const DoubleTab& vitesse, DoubleTab& gij, const Domaine_Cl_VDF& domaine_Cl_VDF) const +DoubleTab& Champ_Face_VDF::calcul_duidxj(const DoubleTab& tab_vitesse, DoubleTab& tab_gij, const Domaine_Cl_VDF& domaine_Cl_VDF) const { const Champ_Face_VDF& vit = ref_cast(Champ_Face_VDF, mon_equation->inconnue()); const Domaine_Cl_VDF& dclvdf = ref_cast(Domaine_Cl_VDF, vit.domaine_Cl_dis()); const Domaine_VDF& domaine_VDF = domaine_vdf(); - const int nb_elem = domaine_VDF.domaine().nb_elem_tot(), N = vitesse.line_size(); - const IntTab& face_voisins = domaine_VDF.face_voisins(), &elem_faces = domaine_VDF.elem_faces(), &Qdm = domaine_VDF.Qdm(); - const IntVect& orientation = domaine_VDF.orientation(); + const int nb_elem = domaine_VDF.domaine().nb_elem_tot(), N = tab_vitesse.line_size(); const int prem_am = domaine_VDF.premiere_arete_mixte(), dern_am = prem_am + domaine_VDF.nb_aretes_mixtes(); const int prem_ai = domaine_VDF.premiere_arete_interne(), dern_ai = prem_ai + domaine_VDF.nb_aretes_internes(); - IntVect element(4); - gij = 0.; + tab_gij = 0.; // On parcourt toutes les 
aretes qui permettent de calculer les termes croises du_i/dx_j // (les termes non-croises sont calcules en bouclant sur les elements) + const IntTab& tab_Qdm = domaine_VDF.Qdm(); + + // Calcul de val_imp_face_bord_ + Champ_Face_get_val_imp_face_bord(vit.temps(), val_imp_face_bord_, dclvdf, &tab_vitesse); + const bool traitement_gradients = Option_VDF::traitement_gradients; + const bool traitement_coins_opt = Option_VDF::traitement_coins; + const int dim = Objet_U::dimension; + Domaine_VDF_View dom_vdf(domaine_VDF); + CIntTabView Qdm = domaine_VDF.Qdm().view_ro(); + CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro(); + CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro(); + CIntArrView orientation = domaine_VDF.orientation().view_ro(); + CDoubleTabView vitesse = tab_vitesse.view_ro(); + CDoubleTabView val_imp = val_imp_face_bord_.view_ro(); + DoubleTabView4 gij = tab_gij.view_rw<4>(); // On commence par les bords int ndeb = domaine_VDF.premiere_arete_bord(), nfin = ndeb + domaine_VDF.nb_aretes_bord(); - for (int num_arete = ndeb; num_arete < nfin; num_arete++) - for (int n=0; n= 0) + { + // 1) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent. + Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25); + Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25); + } + } + else if (traitement_gradients && (n_type == 5 || n_type == 6)) + Kokkos::abort("Issue in Champ_Face_VDF::calcul_duidxj: n_type 5/6 not handled"); + else /* les autres aretes bords ... 
*/ + { + const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3); + const int i = orientation(num0), j = orientation(num2); + const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i/dx_j + const double vit_imp = 0.5 * (val_imp(num0, N*j+n) + val_imp(num1, N*j+n)); // vitesse tangentielle + //Dans cette partie, on conserve le codage de Hyd_SGE_Wale_VDF (num1 et non num2) pour calculer la distance entre le centre de la maille et le bord. + const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / dom_vdf.dist_norm_bord(num1); + element[0] = face_voisins(num2, 0); + element[1] = face_voisins(num2, 1); + for (int k = 0; k < 2; k++) + if (element[k] >= 0) + { + Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25); + Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25); + } + } + }); // fin aretes bords + end_gpu_timer(__KERNEL_NAME__); // On continue avec les coins ndeb = domaine_VDF.premiere_arete_coin(), nfin = ndeb + domaine_VDF.nb_aretes_coin(); - - for (int num_arete = ndeb; num_arete < nfin; num_arete++) - for (int n=0; n -1 ? num0 : num1, f2 = num2 > -1 ? num2 : num3; + for (int n = 0; n < N; n++) { - // 1) 0.5 : pour la periodicite, car on distribuera deux fois sur les elements qui "touchent" cette arete puisqu'elle existe en double. - // 2) 0.5 : idem ci-dessus, car cette fois-ci on a un coin perio-perio. - // 3) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent. 
- gij(element(k), i, j, n) += temp1 * 0.5 * 0.5 * 0.25; - gij(element(k), j, i, n) += temp2 * 0.5 * 0.5 * 0.25; - } - } - - if (n_type == 1) // arete de type perio-paroi - { - const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3); - const int i = orientation(num1), j = orientation(num2); - - const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j - const double vit_imp = 0.5 * (vit.val_imp_face_bord_private(num0, N*j+n) + vit.val_imp_face_bord_private(num1, N*j+n)); // vitesse tangentielle - - const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / domaine_VDF.dist_norm_bord(num1); - - element(0) = face_voisins(num2, 0); - element(1) = face_voisins(num2, 1); - - for (int k = 0; k < 2; k++) - { - // 1) 0.5 : pour la periodicite, car on distribuera deux fois sur les elements qui "touchent" cette arete puisqu'elle existe en double. - // 2) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent. - gij(element(k), i, j, n) += temp1 * 0.5 * 0.25; - gij(element(k), j, i, n) += temp2 * 0.5 * 0.25; - } - } - - // XXX : Elie Saikali : j'ajoute ca pour les coins juste si option_vdf active pour le moment ... 
- - if (Option_VDF::traitement_gradients && Option_VDF::traitement_coins) - { - if (n_type == 14 || n_type == 15) // arete de type fluide-paroi ou paroi-fluide - { - const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3); - const int i = orientation(num1), j = orientation(num2); - - const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j - const double vit_imp = 0.5 * (vit.val_imp_face_bord_private(num0, N*j+n) + vit.val_imp_face_bord_private(num1, N*j+n)); // vitesse tangentielle - - const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / domaine_VDF.dist_norm_bord(num1); - - element(0) = face_voisins(num2, 0); - element(1) = face_voisins(num2, 1); - - for (int k = 0; k < 2; k++) - if (element(k) != -1) - { - gij(element(k), i, j, n) += temp1 * 0.25; - gij(element(k), j, i, n) += temp2 * 0.25; - } - } - else if (n_type == 3 || n_type == 4 || n_type == 8) // arete de type fluide-navier - { - const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3); - const int f1 = num0 > -1 ? num0 : num1, f2 = num2 > -1 ? num2 : num3; - const int i = orientation(f1), j = orientation(f2); - - const double coeff_frot1 = Champ_Face_coeff_frottement_grad_face_bord(f1, n, dclvdf), coeff_frot2 = Champ_Face_coeff_frottement_grad_face_bord(f2, n, dclvdf); - -// int signe = f2 == num3 ? -1 : 1; -// const double temp1 = coeff_frot2 * signe * vitesse(f1, n); -// const double temp2 = coeff_frot1 * signe * vitesse(f2, n); - const double temp1 = coeff_frot2 * (face_voisins(f2, 0)==-1 ? 1:-1)* vitesse(f1, n); - const double temp2 = coeff_frot1 * (face_voisins(f1, 0)==-1 ? 
1:-1)* vitesse(f2, n); - - - element(0) = face_voisins(f1, 0); - element(1) = face_voisins(f1, 1); - - for (int k = 0; k < 2; k++) - if (element(k) != -1) - { - gij(element(k), i, j, n) += temp1 * 0.25; - gij(element(k), j, i, n) += temp2 * 0.25; - } + tab_coeff_frot_f1_coin(num_arete - ndeb, n) = Champ_Face_coeff_frottement_grad_face_bord(f1, n, dclvdf); + tab_coeff_frot_f2_coin(num_arete - ndeb, n) = Champ_Face_coeff_frottement_grad_face_bord(f2, n, dclvdf); } } } - // On continue avec les aretes mixtes + CIntArrView type_arete_coin = domaine_Cl_VDF.type_arete_coin().view_ro(); + CDoubleTabView coeff_frot_f1_coin = tab_coeff_frot_f1_coin.view_ro(); + CDoubleTabView coeff_frot_f2_coin = tab_coeff_frot_f2_coin.view_ro(); + int ndeb_coin = ndeb; - for (int num_arete = prem_am; num_arete < dern_am; num_arete++) - for (int n=0; n= 0) { - // 1) 0.25 : on distribue le gradient de vitesse sur les 3 elements qui l'entourent. - // C'est pour cela que l'on regarde si element(k)!=-1, car dans ce cas la, c'est qu'il s'agit de "la case qui manque" ! - gij(element(k), i, j, n) += temp1 * 0.25; - gij(element(k), j, i, n) += temp2 * 0.25; + // 1) 0.5 : pour la periodicite, car on distribuera deux fois puisqu'elle existe en double. + // 2) 0.5 : idem, car cette fois-ci on a un coin perio-perio. + // 3) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent. + Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.5 * 0.5 * 0.25); + Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.5 * 0.5 * 0.25); } } - // On continue avec les aretes internes - - for (int num_arete = prem_ai; num_arete < dern_ai; num_arete++) - for (int n=0; n= 0) + { + // 1) 0.5 : pour la periodicite. + // 2) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent. 
+ Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.5 * 0.25); + Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.5 * 0.25); + } + } - for (int k = 0; k < 4; k++) + // XXX : Elie Saikali : j'ajoute ca pour les coins juste si option_vdf active pour le moment ... + if (traitement_gradients && traitement_coins_opt) + { + if (n_type == 14 || n_type == 15) // arete de type fluide-paroi ou paroi-fluide + { + const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3); + const int i = orientation(num1), j = orientation(num2); + const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i / dx_j + const double vit_imp = 0.5 * (val_imp(num0, N*j+n) + val_imp(num1, N*j+n)); // vitesse tangentielle + const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / dom_vdf.dist_norm_bord(num1); + element[0] = face_voisins(num2, 0); + element[1] = face_voisins(num2, 1); + for (int k = 0; k < 2; k++) + if (element[k] != -1) + { + Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25); + Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25); + } + } + else if (n_type == 3 || n_type == 4 || n_type == 8) // arete de type fluide-navier { - // 1) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent. - gij(element(k), i, j, n) += temp1 * 0.25; - gij(element(k), j, i, n) += temp2 * 0.25; + const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1); + const int num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3); + const int f1 = num0 > -1 ? num0 : num1, f2 = num2 > -1 ? num2 : num3; + const int i = orientation(f1), j = orientation(f2); + const double coeff_frot1 = coeff_frot_f1_coin(num_arete - ndeb_coin, n); + const double coeff_frot2 = coeff_frot_f2_coin(num_arete - ndeb_coin, n); + const double temp1 = coeff_frot2 * (face_voisins(f2, 0) == -1 ? 1 : -1) * vitesse(f1, n); + const double temp2 = coeff_frot1 * (face_voisins(f1, 0) == -1 ? 
1 : -1) * vitesse(f2, n); + element[0] = face_voisins(f1, 0); + element[1] = face_voisins(f1, 1); + for (int k = 0; k < 2; k++) + if (element[k] != -1) + { + Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25); + Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25); + } } } + }); // fin aretes coins + end_gpu_timer(__KERNEL_NAME__); + + // On continue avec les aretes mixtes + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({prem_am, 0}, {dern_am, N}), + KOKKOS_LAMBDA(const int num_arete, const int n) + { + const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3); + const int i = orientation(num0), j = orientation(num2); + const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i / dx_j + const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / dom_vdf.dist_face_period(num2, num3, i); // du_j / dx_i + const int element[4] = { face_voisins(num0, 0), face_voisins(num0, 1), face_voisins(num1, 0), face_voisins(num1, 1) }; + for (int k = 0; k < 4; k++) + if (element[k] != -1) + { + // 1) 0.25 : on distribue le gradient de vitesse sur les 3 elements qui l'entourent. 
+ Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25); + Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25); + } + }); // fin aretes mixtes + end_gpu_timer(__KERNEL_NAME__); + // On continue avec les aretes internes + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({prem_ai, 0}, {dern_ai, N}), + KOKKOS_LAMBDA(const int num_arete, const int n) + { + const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3); + const int i = orientation(num0), j = orientation(num2); + const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i / dx_j + const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / dom_vdf.dist_face_period(num2, num3, i); // du_j / dx_i + const int element[4] = { face_voisins(num0, 0), face_voisins(num0, 1), face_voisins(num1, 0), face_voisins(num1, 1) }; + for (int k = 0; k < 4; k++) + { + // 1) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent. + Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25); + Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25); + } + }); // fin aretes internes + end_gpu_timer(__KERNEL_NAME__); // XXX : Elie Saikali : HACK pour coins fluides-fluides // pour ce cas (j'avoue cas rare), attention soucis avec les valeurs de la vitesse sur les coins ... 
par exemple un champ_fonc_xyz x+y+z donne pas le bon truc sur les coins // On continue avec les coins - ndeb = domaine_VDF.premiere_arete_coin(), nfin = ndeb + domaine_VDF.nb_aretes_coin(); - - for (int num_arete = ndeb; num_arete < nfin; num_arete++) - for (int n=0; n d = (a+b+c)/3 + if (el0 != -1) { - const int num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2); - const int i = orientation(num1), j = orientation(num2); - - element(0) = face_voisins(num2, 0); - element(1) = face_voisins(num2, 1); - - for (int k = 0; k < 2; k++) - if (element(k) != -1) - { - // XXX : 1/3 car on veut un truc comme ca : (a+b+c+d)/4 = (a+b+c)/3 => d = (a+b+c)/3 - gij(element(k), i, j, n) += gij(element(k), i, j, n) / 3.; - gij(element(k), j, i, n) += gij(element(k), j, i, n) / 3.; - } + Kokkos::atomic_add(&gij(el0, i, j, n), gij(el0, i, j, n) / 3.); + Kokkos::atomic_add(&gij(el0, j, i, n), gij(el0, j, i, n) / 3.); } - } + if (el1 != -1) + { + Kokkos::atomic_add(&gij(el1, i, j, n), gij(el1, i, j, n) / 3.); + Kokkos::atomic_add(&gij(el1, j, i, n), gij(el1, j, i, n) / 3.); + } + } + }); // fin coins fluides-fluides + end_gpu_timer(__KERNEL_NAME__); // 2eme partie : boucle sur les elements et remplissage de Sij pour les derivees non croisees (du_i / dx_i). // En fait dans ces cas la, on calcul directement le gradient dans l'element et on ne redistribue pas. 
+ Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {nb_elem, N}), + KOKKOS_LAMBDA(const int elem, const int n) + { + for (int i = 0; i < dim; i++) + { + const double temp1 = (vitesse(elem_faces(elem, i), n) - vitesse(elem_faces(elem, i + dim), n)) / dom_vdf.dim_elem(elem, orientation(elem_faces(elem, i))); + gij(elem, i, i, n) = -temp1; + } + }); // fin elements + end_gpu_timer(__KERNEL_NAME__); - for (int elem = 0; elem < nb_elem; elem++) - for (int n=0; n(); + DoubleArrView SMA_barre = static_cast(tab_SMA_barre).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem) + { + for (int n = 0; n < N; n++) { - temp = 0.; - for (i = 0; i < dimension; i++) - for (j = 0; j < dimension; j++) + double temp = 0.; + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) { - Sij = 0.5 * (duidxj(elem, i, j, n) + duidxj(elem, j, i, n)); + double Sij = 0.5 * (tab_duidxj(elem, i, j, n) + tab_duidxj(elem, j, i, n)); temp += Sij * Sij; } SMA_barre(elem) = 2. 
* temp; } - - return SMA_barre; - + }); + end_gpu_timer(__KERNEL_NAME__); + return tab_SMA_barre; } -DoubleTab& Champ_Face_VDF::calcul_S_barre_Multiphase(const DoubleTab& vitesse, DoubleTab& SMA_barre, const Domaine_Cl_VDF& domaine_Cl_VDF) const +DoubleTab& Champ_Face_VDF::calcul_S_barre_Multiphase(const DoubleTab& tab_vitesse, DoubleTab& tab_SMA_barre, const Domaine_Cl_VDF& domaine_Cl_VDF) const { const Domaine_VDF& domaine_VDF = domaine_vdf(); const int nb_elem_tot = domaine_VDF.nb_elem_tot(); const int nb_elem = domaine_VDF.nb_elem(); - const int N = vitesse.line_size(); - - int i, j; - int elem; - double Sij, temp; - - DoubleTab duidxj(nb_elem_tot, dimension, dimension, N); - - calcul_duidxj(vitesse, duidxj, domaine_Cl_VDF); - - for (elem = 0; elem < nb_elem; elem++) - for (int n=0; n(); + DoubleTabView SMA_barre = tab_SMA_barre.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem) + { + for (int n = 0; n < N; n++) { - temp = 0.; - for (i = 0; i < dimension; i++) - for (j = 0; j < dimension; j++) + double temp = 0.; + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) { - Sij = 0.5 * (duidxj(elem, i, j, n) + duidxj(elem, j, i, n)); + double Sij = 0.5 * (tab_duidxj(elem, i, j, n) + tab_duidxj(elem, j, i, n)); temp += Sij * Sij; } - SMA_barre(elem,n) = 2. * temp; + SMA_barre(elem, n) = 2. 
* temp; } - - return SMA_barre; - + }); + end_gpu_timer(__KERNEL_NAME__); + return tab_SMA_barre; } -void Champ_Face_VDF::calcul_grad_u(const DoubleTab& vitesse, DoubleTab& grad_u, const Domaine_Cl_VDF& domaine_Cl_VDF) +void Champ_Face_VDF::calcul_grad_u(const DoubleTab& vitesse, DoubleTab& tab_grad_u, const Domaine_Cl_VDF& domaine_Cl_VDF) { const Domaine_VDF& domaine_VDF = domaine_vdf(); const int nb_elem = domaine_VDF.nb_elem(); const int nb_elem_tot = domaine_VDF.nb_elem_tot(), N = vitesse.line_size(); - - DoubleTab gradient_elem(nb_elem_tot, dimension, dimension, N); - gradient_elem = 0.; - - calcul_duidxj(vitesse, gradient_elem, domaine_Cl_VDF); - - for (int elem = 0; elem < nb_elem; elem++) - for (int n=0; n(); + DoubleTabView grad_u = tab_grad_u.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem) + { + for (int n = 0; n < N; n++) + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + grad_u(elem, N * (dim*i+j) + n) = gradient_elem(elem, i, j, n); + }); + end_gpu_timer(__KERNEL_NAME__); } void Champ_Face_VDF::calculer_dscald_centre_element(DoubleTab& dscald) const @@ -1137,6 +1151,7 @@ void Champ_Face_VDF::calculer_dercov_axi(const Domaine_Cl_VDF& domaine_Cl_VDF) int fx0, fx1, fy0, fy1; int num_elem; + ToDo_Kokkos("critical"); for (num_elem = 0; num_elem < domaine_VDF.nb_elem(); num_elem++) { fx0 = elem_faces(num_elem, 0); @@ -1207,20 +1222,18 @@ void Champ_Face_VDF::calculer_dercov_axi(const Domaine_Cl_VDF& domaine_Cl_VDF) signe = Qdm(n_arete, 3); ori1 = orientation(fac1); ori3 = orientation(fac3); - int rang1 = fac1 - domaine_VDF.premiere_face_bord(); - int rang2 = fac2 - domaine_VDF.premiere_face_bord(); double vit_imp; if (n_type == TypeAreteBordVDF::PAROI_FLUIDE) // arete paroi_fluide :il faut determiner qui est la face fluide { if (est_egal(inco[fac1], 0)) - vit_imp = val_imp_face_bord_private(rang2, ori3); + vit_imp = val_imp_face_bord_private(fac2, ori3); else - vit_imp = 
val_imp_face_bord_private(rang1, ori3); + vit_imp = val_imp_face_bord_private(fac1, ori3); } else - vit_imp = 0.5 * (val_imp_face_bord_private(rang1, ori3) + val_imp_face_bord_private(rang2, ori3)); + vit_imp = 0.5 * (val_imp_face_bord_private(fac1, ori3) + val_imp_face_bord_private(fac2, ori3)); if (ori1 == 0) // bord d'equation R = cte { @@ -1365,126 +1378,175 @@ void Champ_Face_VDF::calculer_dercov_axi(const Domaine_Cl_VDF& domaine_Cl_VDF) /* METHODES UTILES MAIS HORS CLASSE */ /* ***************************************************** */ -double Champ_Face_get_val_imp_face_bord_sym(const DoubleTab& tab_valeurs, const double temp, int face, int comp, const Domaine_Cl_VDF& zclo) +void Champ_Face_get_val_imp_face_bord(const double temps, DoubleTab& val_imp_face_bord_, const Domaine_Cl_VDF& zcl, const DoubleTab* tab_inco) { - const Domaine_VDF& domaine_vdf = zclo.domaine_VDF(); - int face_locale = -123; - const int face_globale = face + domaine_vdf.premiere_face_bord(); // Maintenant numero dans le tableau global des faces. - const Domaine_Cl_dis_base& zcl = zclo; //equation().domaine_Cl_dis(); - // On recupere la CL associee a la face et le numero local de la face dans la frontiere. - //assert(equation().domaine_Cl_dis()==zclo); - - const Cond_lim_base& cl = (face < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) : - zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale); - - const IntTab& face_voisins = domaine_vdf.face_voisins(); - const IntTab& elem_faces = domaine_vdf.elem_faces(); - const DoubleVect& porosite = zclo.equation().milieu().porosite_face(); - const int ori = domaine_vdf.orientation()(face_globale); - - if (sub_type(Navier, cl)) + const Domaine_VDF& domaine_vdf = zcl.domaine_VDF(); + // ToDo_Kokkos("Reduce size of val_imp_face_bord_ !"); + int dim = Objet_U::dimension; + int N = tab_inco ? 
tab_inco->line_size() : 1; + if (val_imp_face_bord_.size() == 0) + val_imp_face_bord_.resize(domaine_vdf.nb_faces_tot(), N * dim); + for (int n_bord = 0; n_bord < zcl.nb_cond_lim(); n_bord++) { - const int N = tab_valeurs.line_size(); - const int n=comp%N, comploc = (comp-n)/N; - if (comploc == ori) - return 0; + const Cond_lim_base& cl = zcl.les_conditions_limites(n_bord).valeur(); + const Front_VF& le_bord = ref_cast(Front_VF, cl.frontiere_dis()); + if (sub_type(Navier, cl)) + { + CIntTabView face_voisins = domaine_vdf.face_voisins().view_ro(); + CIntTabView elem_faces = domaine_vdf.elem_faces().view_ro(); + CDoubleArrView porosite = zcl.equation().milieu().porosite_face().view_ro(); + CIntArrView orientation = domaine_vdf.orientation().view_ro(); + CDoubleTabView inco = tab_inco->view_ro(); + CIntArrView le_bord_num_face = le_bord.num_face().view_ro(); + DoubleTabView val_imp_face_bord = val_imp_face_bord_.view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), le_bord.nb_faces_tot(), KOKKOS_LAMBDA(const int ind_face) + { + int face = le_bord_num_face(ind_face); + int ori = orientation(face); + int nb_comp = N * dim; + for (int comp = 0; comp < nb_comp; comp++) + { + double val_imp; + int n = comp % N, comp1 = comp / N; + if (comp1 == ori) + val_imp = 0; + else + { + int comp2 = comp1 + dim; + int elem = face_voisins(face, 0); + if (elem == -1) elem = face_voisins(face, 1); + int fac1 = elem_faces(elem, comp1); + int fac2 = elem_faces(elem, comp2); + double sum = porosite(fac1) + porosite(fac2); + val_imp = sum==0 ? 
0 : (inco(fac1, n) * porosite(fac1) + inco(fac2, n) * porosite(fac2)) / sum; + } + val_imp_face_bord(face, comp) = val_imp; + } + }); + end_gpu_timer(__KERNEL_NAME__); + } + else if (sub_type(Dirichlet_entree_fluide, cl) || sub_type(Dirichlet_paroi_defilante, cl)) + { + CDoubleTabView vals = ref_cast(Dirichlet, cl).tab_val_imp(temps).view_ro(); + CIntArrView le_bord_num_face = le_bord.num_face().view_ro(); + DoubleTabView val_imp_face_bord = val_imp_face_bord_.view_wo(); + const bool ch_unif = vals.extent(0) == 1; + const int nb_comp = (int)vals.extent(1); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), le_bord.nb_faces_tot(), KOKKOS_LAMBDA(const int ind_face) + { + int face = le_bord_num_face(ind_face); + for (int comp = 0; comp < nb_comp; comp++) + val_imp_face_bord(face, comp) = vals(ch_unif ? 0 : ind_face, comp); + }); + end_gpu_timer(__KERNEL_NAME__); + } else { - int elem = 0; - if (face_voisins(face_globale, 0) != -1) - elem = face_voisins(face_globale, 0); - else - elem = face_voisins(face_globale, 1); - const int comp2 = comploc + Objet_U::dimension; - return (tab_valeurs(elem_faces(elem, comploc), n) * porosite[elem_faces(elem, comploc)] + tab_valeurs(elem_faces(elem, comp2), n) * porosite[elem_faces(elem, comp2)]) - / (porosite[elem_faces(elem, comploc)] + porosite[elem_faces(elem, comp2)]); + CIntArrView le_bord_num_face = le_bord.num_face().view_ro(); + DoubleTabView val_imp_face_bord = val_imp_face_bord_.view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), le_bord.nb_faces_tot(), KOKKOS_LAMBDA(const int ind_face) + { + int face = le_bord_num_face(ind_face); + int nb_comp = N * dim; + for (int comp = 0; comp < nb_comp; comp++) + val_imp_face_bord(face, comp) = 0; + }); + end_gpu_timer(__KERNEL_NAME__); } } - - if (!cl.champ_front().has_valeurs_au_temps(temp)) // si pas encore initialise !! - return 0.; - - const DoubleTab& vals = cl.champ_front().valeurs_au_temps(temp); - const int face_de_vals = vals.dimension(0) == 1 ? 
0 : face_locale; - - if (sub_type(Dirichlet_entree_fluide, cl)) - return vals(face_de_vals, comp); - else if (sub_type(Dirichlet_paroi_fixe, cl)) - return 0.; - else if (sub_type(Dirichlet_paroi_defilante, cl)) - return vals(face_de_vals, comp); - - return 0.; // All other cases } -double Champ_Face_get_val_imp_face_bord(const double temp, int face, int comp, const Domaine_Cl_VDF& zclo) +double Champ_Face_get_val_imp_face_bord(const double temp, int face_globale, int comp, const Domaine_Cl_VDF& zcl, const DoubleTab* tab_valeurs) { - const Domaine_VDF& domaine_vdf = zclo.domaine_VDF(); + const Domaine_VDF& domaine_vdf = zcl.domaine_VDF(); + int face_locale = -123; - const int face_globale = face + domaine_vdf.premiere_face_bord(); // Maintenant numero dans le tableau global des faces. - const Domaine_Cl_dis_base& zcl = zclo; //equation().domaine_Cl_dis(); - // On recupere la CL associee a la face et le numero local de la face dans la frontiere. - //assert(equation().domaine_Cl_dis()==zclo); + const Cond_lim_base& cl = (face_globale < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) : zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale); - const Cond_lim_base& cl = (face < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) : - zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale); - const int ori = domaine_vdf.orientation()(face_globale); + if (!cl.champ_front().has_valeurs_au_temps(temp)) // si pas encore initialise !! 
+ return 0.; if (sub_type(Navier, cl)) { + double val_imp; + const int ori = domaine_vdf.orientation()(face_globale); if (comp == ori) - return 0.; + val_imp = 0.; else { - Process::exit("You should call Champ_Face_get_val_imp_face_bord_sym and not Champ_Face_get_val_imp_face_bord\n"); - return 1.e9; + const int N = tab_valeurs->line_size(); + const int n = comp % N, comploc = (comp - n) / N; + if (comploc == ori) + val_imp = 0.; + else + { + const IntTab& face_voisins = domaine_vdf.face_voisins(); + const IntTab& elem_faces = domaine_vdf.elem_faces(); + const DoubleVect& porosite = zcl.equation().milieu().porosite_face(); + int elem = 0; + if (face_voisins(face_globale, 0) != -1) + elem = face_voisins(face_globale, 0); + else + elem = face_voisins(face_globale, 1); + const int comp2 = comploc + Objet_U::dimension; + val_imp = ((*tab_valeurs)(elem_faces(elem, comploc), n) * porosite[elem_faces(elem, comploc)] + + (*tab_valeurs)(elem_faces(elem, comp2), n) * porosite[elem_faces(elem, comp2)]) + / (porosite[elem_faces(elem, comploc)] + porosite[elem_faces(elem, comp2)]); + } } + return val_imp; } - - if (!cl.champ_front().has_valeurs_au_temps(temp)) // si pas encore initialise !! - return 0.; - - const DoubleTab& vals = cl.champ_front().valeurs_au_temps(temp); - int face_de_vals = vals.dimension(0) == 1 ? 
0 : face_locale; - - if (sub_type(Dirichlet_entree_fluide, cl)) - return vals(face_de_vals, comp); - else if (sub_type(Dirichlet_paroi_fixe, cl)) - return 0.; - else if (sub_type(Dirichlet_paroi_defilante, cl)) - return vals(face_de_vals, comp); - - return 0.; // All other cases -} - -double Champ_Face_get_val_imp_face_bord(const double temp, int face, int comp, int comp2, const Domaine_Cl_VDF& zclo) -{ - Process::exit("Champ_Face_VDF::val_imp_face_bord(,,) exit\n"); - return 0.; // For compilers + else if (sub_type(Dirichlet_entree_fluide, cl) || sub_type(Dirichlet_paroi_defilante, cl)) + { + const DoubleTab& vals = cl.champ_front().valeurs_au_temps(temp); + return vals(vals.dimension(0) == 1 ? 0 : face_locale, comp); + } + else + return 0.; // All other cases } -double Champ_Face_coeff_frottement_face_bord(const int f, const int n, const Domaine_Cl_VDF& zclo) +void Champ_Face_coeff_frottement_face_bord(DoubleTab& coeff_frottement_face_bord_, const Domaine_Cl_VDF& zcl) { - const Domaine_VDF& domaine_vdf = zclo.domaine_VDF(); - const Domaine_Cl_dis_base& zcl = zclo; - const int face_globale = f + domaine_vdf.premiere_face_bord(); // Maintenant numero dans le tableau global des faces. - - int face_locale = -123; - const Cond_lim_base& cl = (f < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) : - zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale); - - return sub_type(Navier, cl) ? 
ref_cast(Navier, cl).coefficient_frottement(face_locale,n) : 0.; + const Domaine_VDF& domaine_vdf = zcl.domaine_VDF(); + //ToDo_Kokkos("Reduce size of coeff_frottement_face_bord_ !"); + if (coeff_frottement_face_bord_.size() == 0) + coeff_frottement_face_bord_.resize(domaine_vdf.nb_faces_tot(), Objet_U::dimension); + + // Compute coeff_frottement_face_bord_ + int dim = Objet_U::dimension; + for (int n_bord = 0; n_bord < domaine_vdf.nb_front_Cl(); n_bord++) + { + const Cond_lim_base& cl = zcl.les_conditions_limites(n_bord).valeur(); + const Front_VF& le_bord = ref_cast(Front_VF, cl.frontiere_dis()); + if (sub_type(Navier, cl)) + { + int nb_comp = dim; + const Navier& la_cl = ref_cast(Navier, cl); + CIntArrView le_bord_num_face = le_bord.num_face().view_ro(); + CDoubleTabView coefficient_frottement; + if (la_cl.coefficient_frottement()) + { + coefficient_frottement = la_cl.coefficient_frottement()->view_ro(); + nb_comp = (int)coefficient_frottement.extent(1); + } + DoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), le_bord.nb_faces_tot(), KOKKOS_LAMBDA(const int ind_face) + { + int face = le_bord_num_face(ind_face); + for (int comp = 0; comp < nb_comp; comp++) + coeff_frottement_face_bord(face, comp) = coefficient_frottement.data() ? coefficient_frottement(ind_face, comp) : 0; + }); + end_gpu_timer(__KERNEL_NAME__); + } + } } -double Champ_Face_coeff_frottement_grad_face_bord(const int f, const int n, const Domaine_Cl_VDF& zclo) +double Champ_Face_coeff_frottement_grad_face_bord(const int face_globale, const int n, const Domaine_Cl_VDF& zcl) { - const Domaine_VDF& domaine_vdf = zclo.domaine_VDF(); - const Domaine_Cl_dis_base& zcl = zclo; - const int face_globale = f + domaine_vdf.premiere_face_bord(); // Maintenant numero dans le tableau global des faces. 
+ const Domaine_VDF& domaine_vdf = zcl.domaine_VDF(); int face_locale = -123; - const Cond_lim_base& cl = (f < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) : - zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale); + const Cond_lim_base& cl = (face_globale < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) : zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale); return sub_type(Navier, cl) ? ref_cast(Navier, cl).coefficient_frottement_grad(face_locale,n) : 0.; } diff --git a/src/VDF/Champs/Champ_Face_VDF.h b/src/VDF/Champs/Champ_Face_VDF.h index 3205d4e880..f4bb1b7c6d 100644 --- a/src/VDF/Champs/Champ_Face_VDF.h +++ b/src/VDF/Champs/Champ_Face_VDF.h @@ -92,6 +92,11 @@ class Champ_Face_VDF : public Champ_Face_base, public Champ_Face_VDF_implementat return Champ_Face_VDF_implementation::valeur_a_elem_compo(position, le_poly, ncomp); } + inline DoubleTab& valeur_aux_centres_de_gravite(const Domaine& dom, DoubleTab& tab_valeurs) const override + { + return Champ_Face_VDF_implementation::valeur_aux_centres_de_gravite(tab_valeurs); + } + inline DoubleTab& valeur_aux_elems(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& tab_valeurs) const override { return Champ_Face_VDF_implementation::valeur_aux_elems(positions, les_polys, tab_valeurs); @@ -165,12 +170,12 @@ class Champ_Face_VDF : public Champ_Face_base, public Champ_Face_VDF_implementat DoubleTab tau_diag_; // termes diagonaux du tenseur Grad DoubleTab tau_croises_; // termes extradiagonaux du tenseur Grad + mutable DoubleTab val_imp_face_bord_; // Tableau de travail qui stocke les valeurs imposees aux faces de bord. 
Utile pour le GPU }; -double Champ_Face_coeff_frottement_face_bord(const int, const int , const Domaine_Cl_VDF& zclo); +void Champ_Face_coeff_frottement_face_bord(DoubleTab&, const Domaine_Cl_VDF& zcl); +void Champ_Face_get_val_imp_face_bord(const double temp, DoubleTab&, const Domaine_Cl_VDF& zcl, const DoubleTab* tab_valeurs=nullptr); double Champ_Face_coeff_frottement_grad_face_bord(const int, const int , const Domaine_Cl_VDF& zclo); -double Champ_Face_get_val_imp_face_bord_sym(const DoubleTab& tab_valeurs, const double temp,int face,int comp, const Domaine_Cl_VDF& zclo); -double Champ_Face_get_val_imp_face_bord( const double temp,int face,int comp, const Domaine_Cl_VDF& zclo) ; -double Champ_Face_get_val_imp_face_bord( const double temp,int face,int comp, int comp2, const Domaine_Cl_VDF& zclo) ; +double Champ_Face_get_val_imp_face_bord(const double temp,int face,int comp, const Domaine_Cl_VDF& zclo, const DoubleTab* tab_valeurs=nullptr); #endif /* Champ_Face_VDF_included */ diff --git a/src/VDF/Champs/Champ_Face_VDF_implementation.cpp b/src/VDF/Champs/Champ_Face_VDF_implementation.cpp index 272cc32075..dc7ed330a5 100644 --- a/src/VDF/Champs/Champ_Face_VDF_implementation.cpp +++ b/src/VDF/Champs/Champ_Face_VDF_implementation.cpp @@ -19,6 +19,8 @@ #include #include #include +#include +#include DoubleTab& Champ_Face_VDF_implementation::valeur_aux_elems(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& val_elem) const { @@ -35,53 +37,94 @@ DoubleVect& Champ_Face_VDF_implementation::valeur_a_elem(const DoubleVect& posit return valeur_a_elem_(le_champ().valeurs(), position, val, e); } -DoubleTab& Champ_Face_VDF_implementation::valeur_aux_elems_(const DoubleTab& val_face, const DoubleTab& positions, const IntVect& les_polys, DoubleTab& val_elem) const +DoubleTab& Champ_Face_VDF_implementation::valeur_aux_centres_de_gravite(DoubleTab& tab_val_elem) const { - if (val_elem.nb_dim() > 2) + if (tab_val_elem.nb_dim() > 2) { Cerr << "Erreur TRUST dans 
Champ_Face_implementation::valeur_aux_elems()" << finl; Cerr << "Le DoubleTab val a plus de 2 entrees" << finl; Process::exit(); } - - - const int N = val_face.line_size(), D = Objet_U::dimension, M = le_champ().nb_comp(); - DoubleVect val_e(N * D), x(D); - val_elem = 0.0; - //assert(val_elem.line_size()==N * std::min(D, M)); - - for (int p = 0; p < les_polys.size(); p++) - { - for (int d = 0; d < D; d++) x(d) = positions(p, d); - valeur_a_elem_(val_face, x, val_e, les_polys(p)); - for (int i = 0; i < N * std::min(D, M); i++) val_elem(p, i) = val_e(i); - } - - return val_elem; + const DoubleTab& tab_val_face = le_champ().valeurs(); + const int N = tab_val_face.line_size(), D = Objet_U::dimension; + const int nb_comp = le_champ().nb_comp(); + tab_val_elem = 0.0; + int size = tab_val_elem.dimension(0); + CIntTabView e_f = domaine_vdf().elem_faces().view_ro(); + CDoubleTabView val_face = tab_val_face.view_ro(); + DoubleTabView val_elem = tab_val_elem.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, size), KOKKOS_LAMBDA(const int e) + { + // TODO : FIXME : cas avec line_size 1 mais nb_dim != 2 ... vu dans cathare3D + for (int d = 0; d < D; d++) + { + for (int n = 0; n < N; n++) + { + const double v1 = val_face(e_f(e, d), n); + const double v2 = val_face(e_f(e, d + D), n); + const double interp = 0.5 * (v1 + v2); + if (nb_comp == 1) + { + } + else if (d < nb_comp) + val_elem(e, N * d + n) = interp; + } + } + }); + end_gpu_timer(__KERNEL_NAME__); + return tab_val_elem; } -/* Elie SAIKALI : utilise pour CGNS => passer champ face a un champ vect aux faces ! 
*/ -DoubleTab& Champ_Face_VDF_implementation::valeur_aux_faces_post_impl(const Domaine_VDF& vdf, DoubleTab& result) const +DoubleTab& Champ_Face_VDF_implementation::valeur_aux_elems_(const DoubleTab& tab_val_face, const DoubleTab& tab_positions, const IntVect& tab_les_polys, DoubleTab& tab_val_elem) const { - const Champ_base& cha = le_champ(); - const DoubleTab& val = cha.valeurs(); - const int nb_compo = cha.nb_comp(), N = val.line_size(), D = Objet_U::dimension; - - if (nb_compo == 1) - Process::exit("TRUST error in Champ_Face_VDF_implementation::valeur_aux_faces_post_impl : A scalar field cannot be of Champ_Face type !"); - - const int nb_faces = vdf.nb_faces(); - - assert(nb_faces == val.dimension(0)); - - result.resize(nb_faces, N * D); + if (tab_val_elem.nb_dim() > 2) + { + Cerr << "Erreur TRUST dans Champ_Face_implementation::valeur_aux_elems()" << finl; + Cerr << "Le DoubleTab val a plus de 2 entrees" << finl; + Process::exit(); + } - for (int f = 0; f < nb_faces; f++) + const int N = tab_val_face.line_size(), D = Objet_U::dimension; + const int nb_comp = le_champ().nb_comp(); + tab_val_elem = 0.0; + const Domaine_VDF& domaine_VDF = domaine_vdf(); + const Domaine& domaine_geom = get_domaine_geom(); + int size = tab_les_polys.size(); + CIntTabView f_s = domaine_VDF.face_sommets().view_ro(); + CIntTabView e_f = domaine_VDF.elem_faces().view_ro(); + CDoubleTabView coord = domaine_geom.coord_sommets().view_ro(); + CDoubleTabView positions = tab_positions.view_ro(); + CIntArrView les_polys = tab_les_polys.view_ro(); + CDoubleTabView val_face = tab_val_face.view_ro(); + DoubleTabView val_elem = tab_val_elem.view_rw(); + const double precision_geom = Objet_U::precision_geom; + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, size), KOKKOS_LAMBDA(const int p) + { + // TODO : FIXME : cas avec line_size 1 mais nb_dim != 2 ... 
vu dans cathare3D + const int e = les_polys(p); + if (e == -1) return; for (int d = 0; d < D; d++) - for (int n = 0; n < N; n++) - result(f, N * d + n) = val(f, n) * vdf.face_normales(f, d) / vdf.face_surfaces(f); - - return result; + { + const int som0 = f_s(e_f(e, d), 0); + const int som1 = f_s(e_f(e, d + D), 0); + const double psi = (positions(p, d) - coord(som0, d)) / (coord(som1, d) - coord(som0, d)); + for (int n = 0; n < N; n++) + { + const double v1 = val_face(e_f(e, d), n); + const double v2 = val_face(e_f(e, d + D), n); + const double interp = interpolation(v1, v2, psi); + if (nb_comp == 1) + { + if (Kokkos::fabs(psi) < precision_geom || Kokkos::fabs(1. - psi) < precision_geom) + val_elem(p, 0) = interp; + } + else if (d < nb_comp) + val_elem(p, N * d + n) = interp; + } + } + }); + end_gpu_timer(__KERNEL_NAME__); + return tab_val_elem; } DoubleVect& Champ_Face_VDF_implementation::valeur_a_elem_(const DoubleTab& val_face, const DoubleVect& position, DoubleVect& val, int e) const @@ -115,16 +158,47 @@ DoubleVect& Champ_Face_VDF_implementation::valeur_a_elem_(const DoubleTab& val_f return val; } -DoubleVect& Champ_Face_VDF_implementation::valeur_aux_elems_compo(const DoubleTab& positions, const IntVect& les_polys, DoubleVect& val, int ncomp) const +/* Elie SAIKALI : utilise pour CGNS => passer champ face a un champ vect aux faces ! 
*/ +DoubleTab& Champ_Face_VDF_implementation::valeur_aux_faces_post_impl(const Domaine_VDF& vdf, DoubleTab& result) const +{ + const Champ_base& cha = le_champ(); + const DoubleTab& val = cha.valeurs(); + const int nb_compo = cha.nb_comp(), N = val.line_size(), D = Objet_U::dimension; + + if (nb_compo == 1) + Process::exit("TRUST error in Champ_Face_VDF_implementation::valeur_aux_faces_post_impl : A scalar field cannot be of Champ_Face type !"); + + const int nb_faces = vdf.nb_faces(); + + assert(nb_faces == val.dimension(0)); + + result.resize(nb_faces, N * D); + ToDo_Kokkos("critical"); + for (int f = 0; f < nb_faces; f++) + for (int d = 0; d < D; d++) + for (int n = 0; n < N; n++) + result(f, N * d + n) = val(f, n) * vdf.face_normales(f, d) / vdf.face_surfaces(f); + + return result; +} + +DoubleVect& Champ_Face_VDF_implementation::valeur_aux_elems_compo(const DoubleTab& tab_positions, const IntVect& tab_les_polys, DoubleVect& tab_val, int ncomp) const { - assert(val.size_totale() >= les_polys.size()); + assert(tab_val.size_totale() >= tab_les_polys.size()); const int D = Objet_U::dimension; - const DoubleTab& coord = domaine_vdf().domaine().coord_sommets(); - const IntTab& f_s = domaine_vdf().face_sommets(), &e_f = domaine_vdf().elem_faces(); - const DoubleTab& vals = le_champ().valeurs(); - int size = les_polys.size(); - for(int p = 0; p < size; p++) - { + const DoubleTab& tab_coord = domaine_vdf().domaine().coord_sommets(); + const IntTab& tab_f_s = domaine_vdf().face_sommets(), &tab_e_f = domaine_vdf().elem_faces(); + const DoubleTab& tab_vals = le_champ().valeurs(); + int size = tab_les_polys.size(); + + CDoubleTabView positions = tab_positions.view_ro(); + CIntArrView les_polys = tab_les_polys.view_ro(); + DoubleArrView val = tab_val.view_wo(); + CDoubleArrView vals = static_cast(tab_vals).view_ro(); + CIntTabView e_f = tab_e_f.view_ro(); + CIntTabView f_s = tab_f_s.view_ro(); + CDoubleTabView coord = tab_coord.view_ro(); + 
Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),range_1D(0, size), KOKKOS_LAMBDA(const int p){ int e = les_polys(p); if (e<0) { @@ -132,13 +206,15 @@ DoubleVect& Champ_Face_VDF_implementation::valeur_aux_elems_compo(const DoubleTa } else { - const double val1 = vals(e_f(e, ncomp)), val2 = vals(e_f(e, D + ncomp)); + // const double val1 = tab_vals(tab_e_f(e, ncomp)), val2 = tab_vals(tab_e_f(e, D + ncomp)); + const double val1 = vals(e_f(e, ncomp)), val2 = vals(e_f(e, D + ncomp)); // I'm a bit worried about the layout consistency here moving between host (LayoutRight) and device (LayoutLeft)! const int som0 = f_s(e_f(e, ncomp), 0), som1 = f_s(e_f(e, D + ncomp), 0); const double psi = (positions(p, ncomp) - coord(som0, ncomp)) / (coord(som1, ncomp) - coord(som0, ncomp)); val(p) = interpolation(val1, val2, psi); } - } - return val; + }); + end_gpu_timer(__KERNEL_NAME__); + return tab_val; } double Champ_Face_VDF_implementation::valeur_a_elem_compo(const DoubleVect& position, int e, int d) const @@ -177,28 +253,52 @@ DoubleTab& Champ_Face_VDF_implementation::valeur_aux_sommets(const Domaine& dom, const int nb_elem_tot = dom.nb_elem_tot(), nb_som = dom.nb_som(), nb_som_elem = dom.nb_som_elem(); const int N = le_champ().valeurs().line_size(), D = Objet_U::dimension; - IntVect compteur(nb_som); - ch_som = 0, compteur = 0; + ArrOfInt tab_compteur(nb_som); + ch_som = 0, tab_compteur = 0; - DoubleVect position(D), val_e(N * D); + // Count queries (only nodes local to this process) + int nb_queries = 0; for (int e = 0; e < nb_elem_tot; e++) - for (int j = 0, s; j < nb_som_elem; j++) - if ((s = dom.sommet_elem(e, j)) < nb_som) - { - for(int d = 0; d < D; d++) - position(d) = dom.coord(s, d); + for (int j = 0; j < nb_som_elem; j++) + if (dom.sommet_elem(e, j) < nb_som) + nb_queries++; - compteur[s]++; - valeur_a_elem(position, val_e, e); - for (int n = 0; n < N; n++) + DoubleTab tab_positions(nb_queries, D); + IntVect tab_les_polys(nb_queries); + IntVect 
tab_sommets(nb_queries); + + int q = 0; + for (int e = 0; e < nb_elem_tot; e++) + for (int j = 0; j < nb_som_elem; j++) + { + const int s = dom.sommet_elem(e, j); + if (s < nb_som) + { for (int d = 0; d < D; d++) - ch_som(s, N * d + n) += val_e(N * d + n); - } + tab_positions(q, d) = dom.coord(s, d); + tab_les_polys(q) = e; + tab_sommets(q) = s; + q++; + } + } + + DoubleTab tab_val_elem(nb_queries, N * D); + tab_val_elem = 0.; + valeur_aux_elems_(le_champ().valeurs(), tab_positions, tab_les_polys, tab_val_elem); + + for (int qi = 0; qi < nb_queries; qi++) + { + const int s = tab_sommets(qi); + tab_compteur[s]++; + for (int n = 0; n < N; n++) + for (int d = 0; d < D; d++) + ch_som(s, N * d + n) += tab_val_elem(qi, N * d + n); + } for (int s = 0; s < nb_som; s++) for (int n = 0; n < N; n++) for (int d = 0; d < D; d++) - ch_som(s, N * d + n) /= compteur[s]; + ch_som(s, N * d + n) /= tab_compteur[s]; return ch_som; } @@ -223,6 +323,7 @@ DoubleVect& Champ_Face_VDF_implementation::valeur_aux_sommets_compo(const Domain compteur = 0; DoubleVect position(Objet_U::dimension); + ToDo_Kokkos("critical"); for (num_elem=0; num_elem(tab_y).view_ro(); + CIntArrView ori = tab_ori.view_ro(); + if (tab_x.dimension(1) == 1) { - for (i=0; i +#include class Frontiere_dis_base; class Domaine_VDF; @@ -27,6 +28,7 @@ class Champ_Face_VDF_implementation : public Champ_implementation_divers public: DoubleVect& valeur_a_elem(const DoubleVect& position, DoubleVect& val, int le_poly) const override; double valeur_a_elem_compo(const DoubleVect& position, int le_poly, int ncomp) const override; + DoubleTab& valeur_aux_centres_de_gravite(DoubleTab& valeurs) const; DoubleTab& valeur_aux_elems(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs) const override; DoubleTab& valeur_aux_elems_passe(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs) const; DoubleVect& valeur_aux_elems_compo(const DoubleTab& positions, const IntVect& les_polys, DoubleVect& 
valeurs, int ncomp) const override; @@ -38,21 +40,23 @@ class Champ_Face_VDF_implementation : public Champ_implementation_divers DoubleTab& valeur_aux_faces_post_impl(const Domaine_VDF&, DoubleTab& result) const; + protected_but_public_for_cuda + DoubleTab& valeur_aux_elems_(const DoubleTab& val_face, const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs) const; + protected: virtual const Domaine_VDF& domaine_vdf() const = 0; - inline double interpolation(const double, const double, const double) const; + KOKKOS_INLINE_FUNCTION static double interpolation(const double, const double, const double); DoubleTab& trace(const Frontiere_dis_base& fr, const DoubleTab& y, DoubleTab& x, int distant) const; private: - DoubleTab& valeur_aux_elems_(const DoubleTab& val_face, const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs) const; DoubleVect& valeur_a_elem_(const DoubleTab& val_face, const DoubleVect& position, DoubleVect& val, int le_poly) const; }; -inline double Champ_Face_VDF_implementation::interpolation(const double val1, const double val2, const double psi) const +KOKKOS_INLINE_FUNCTION double Champ_Face_VDF_implementation::interpolation(const double val1, const double val2, const double psi) { double epsilon=1.e-12; - if (std::fabs(psi) < epsilon) + if (Kokkos::fabs(psi) < epsilon) return val1 ; - else if (std::fabs(1.-psi) < epsilon) + else if (Kokkos::fabs(1.-psi) < epsilon) return val2 ; else return val1 + psi * (val2-val1) ; diff --git a/src/VDF/Champs/Champ_Fonc_Face_VDF.cpp b/src/VDF/Champs/Champ_Fonc_Face_VDF.cpp index 98ef8d7afa..9c3d33d6a2 100644 --- a/src/VDF/Champs/Champ_Fonc_Face_VDF.cpp +++ b/src/VDF/Champs/Champ_Fonc_Face_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -63,6 +63,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch) if (sub_type(Champ_Uniforme, ch)) { + ToDo_Kokkos("critical"); for (int num_face = 0; num_face < nb_faces; num_face++) val(num_face) = v(0, orientation(num_face)); } @@ -72,6 +73,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch) const IntTab& face_voisins = domaine_VDF.face_voisins(); int num_face; + ToDo_Kokkos("critical"); for (num_face = 0; num_face < ndeb_int; num_face++) { ori = orientation(num_face); @@ -82,6 +84,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch) val(num_face) = v(face_voisins(num_face, 1), ori); } + ToDo_Kokkos("critical"); for (num_face = ndeb_int; num_face < nb_faces; num_face++) { ori = orientation(num_face); @@ -106,6 +109,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch) int nbz = 0; int num_face, k; + ToDo_Kokkos("critical"); for (num_face = 0; num_face < nb_faces; num_face++) { ori = orientation(num_face); @@ -134,6 +138,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch) if (dimension == 3) ch.valeur_aux_compo(positionZ, W, 2); nbx = nby = nbz = 0; + ToDo_Kokkos("critical"); for (num_face = 0; num_face < nb_faces; num_face++) { ori = orientation(num_face); diff --git a/src/VDF/Champs/Champ_Fonc_Face_VDF.h b/src/VDF/Champs/Champ_Fonc_Face_VDF.h index 07ef7d0605..322bc24675 100644 --- a/src/VDF/Champs/Champ_Fonc_Face_VDF.h +++ b/src/VDF/Champs/Champ_Fonc_Face_VDF.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -48,6 +48,11 @@ class Champ_Fonc_Face_VDF : public Champ_Fonc_base, public Champ_Face_VDF_implem return Champ_Face_VDF_implementation::valeur_a_elem_compo(position, le_poly, ncomp); } + inline DoubleTab& valeur_aux_centres_de_gravite(const Domaine& dom, DoubleTab& tab_valeurs) const override + { + return Champ_Face_VDF_implementation::valeur_aux_centres_de_gravite(tab_valeurs); + } + inline DoubleTab& valeur_aux_elems(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& tab_valeurs) const override { return Champ_Face_VDF_implementation::valeur_aux_elems(positions, les_polys, tab_valeurs); diff --git a/src/VDF/Champs/Champ_Fonc_Q1_VDF.cpp b/src/VDF/Champs/Champ_Fonc_Q1_VDF.cpp index 88547783d3..589e247cfa 100644 --- a/src/VDF/Champs/Champ_Fonc_Q1_VDF.cpp +++ b/src/VDF/Champs/Champ_Fonc_Q1_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2022, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -37,6 +37,7 @@ int Champ_Fonc_Q1_VDF::imprime(Sortie& os, int ncomp) const const DoubleTab& val = valeurs(); int som; os << nb_som << finl; + ToDo_Kokkos("critical"); for (som = 0; som < nb_som; som++) { if (dimension == 3) diff --git a/src/VDF/Champs/Champ_Fonc_Tabule_P0_VDF.cpp b/src/VDF/Champs/Champ_Fonc_Tabule_P0_VDF.cpp index ba3887fb36..acc18e15f5 100644 --- a/src/VDF/Champs/Champ_Fonc_Tabule_P0_VDF.cpp +++ b/src/VDF/Champs/Champ_Fonc_Tabule_P0_VDF.cpp @@ -14,8 +14,10 @@ *****************************************************************************/ #include +#include #include #include +#include Implemente_instanciable(Champ_Fonc_Tabule_P0_VDF, "Champ_Fonc_Tabule_P0_VDF", Champ_Fonc_P0_VDF); @@ -31,46 +33,7 @@ void Champ_Fonc_Tabule_P0_VDF::associer_param(const VECT(OBS_PTR(Champ_base)) &l void Champ_Fonc_Tabule_P0_VDF::mettre_a_jour(double t) { - const Domaine_VF& domaine_VF = le_dom_VF.valeur(); - const Table& table = la_table.valeur(); - DoubleTab& mes_valeurs = valeurs(); - const int nb_elem = domaine_VF.nb_elem(), nb_elem_tot = domaine_VF.nb_elem_tot(), nb_param = les_ch_param.size(); - DoubleTabs val_params_aux_elems; - for (int i = 0; i < nb_param; i++) - { - assert(les_ch_param[i]->valeurs().dimension(1) == 1 || les_ch_param[i]->valeurs().dimension(1) == mes_valeurs.dimension(1)); - DoubleTab vp(nb_elem_tot, les_ch_param[i]->valeurs().dimension(1)); - val_params_aux_elems.add(vp); - } - const DoubleTab& centres_de_gravites = domaine_VF.xp(); - IntVect les_polys(nb_elem_tot); - for (int elem = 0; elem < nb_elem_tot; elem++) - les_polys(elem) = elem; - - // Estimate the field parameter on cells: - for (int i = 0; i < nb_param; i++) - les_ch_param[i]->valeur_aux_elems(centres_de_gravites, les_polys, val_params_aux_elems[i]); - // Compute the field according to the parameter field - if (table.isfonction() != 
2) - { - const int nbcomp = mes_valeurs.dimension(1); - std::vector vals; - vals.reserve(nb_param); // Pre-allocate space once - for (int num_elem = 0; num_elem < nb_elem; num_elem++) - for (int ncomp = 0; ncomp < nbcomp; ncomp++) - { - vals.clear(); - for (int n = 0; n < nb_param; n++) - vals.push_back(val_params_aux_elems[n](num_elem, les_ch_param[n]->valeurs().dimension(1) == 1 ? 0 : ncomp)); - mes_valeurs(num_elem, ncomp) = table.val(vals, ncomp); - } - } - else - { - table.valeurs(val_params_aux_elems[0], centres_de_gravites, t, mes_valeurs); - } - - Champ_Fonc_base::mettre_a_jour(t); + Champ_Fonc_P0_base::mettre_a_jour(t, la_table.valeur(), les_ch_param); } int Champ_Fonc_Tabule_P0_VDF::initialiser(const double un_temps) diff --git a/src/VDF/Champs/Champ_P0_VDF.cpp b/src/VDF/Champs/Champ_P0_VDF.cpp index 41e72794c2..269dd4efa2 100644 --- a/src/VDF/Champs/Champ_P0_VDF.cpp +++ b/src/VDF/Champs/Champ_P0_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -59,6 +59,7 @@ DoubleVect Champ_P0_VDF::moyenne(const DoubleVect& porosite_elem) const moy = 0; double coef, sum_vol = 0; + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < domaine_VDF().nb_elem(); num_elem++) { coef = porosite_elem(num_elem) * volumes(num_elem); @@ -82,6 +83,7 @@ double Champ_P0_VDF::moyenne(const DoubleVect& porosite_elem, int ncomp) const double moy = 0; double coef, sum_vol = 0; + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < domaine_VDF().nb_elem(); num_elem++) { coef = porosite_elem(num_elem) * volumes(num_elem); @@ -232,6 +234,7 @@ double Champ_P0_VDF::integrale_espace(int ncomp) const const DoubleTab& val = valeurs(); assert(ncomp < val.line_size()); + ToDo_Kokkos("critical"); for (elem = 0; elem < nb_elem; elem++) integr += val(elem, ncomp) * volumes(elem); diff --git a/src/VDF/Champs/Champ_P0_VDF.h b/src/VDF/Champs/Champ_P0_VDF.h index 62ea5b38ca..037c787b27 100644 --- a/src/VDF/Champs/Champ_P0_VDF.h +++ b/src/VDF/Champs/Champ_P0_VDF.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -35,6 +35,7 @@ public : DoubleTab& remplir_coord_noeuds(DoubleTab& ) const override; int imprime(Sortie& os, int nb_compo_) const override; double integrale_espace(int ncomp) const override; + DoubleTab& valeur_aux_centres_de_gravite(const Domaine& dom, DoubleTab& result) const override { return Champ_implementation_P0::valeur_aux_centres_de_gravite(dom, result); } }; #endif /* Champ_P0_VDF_included */ diff --git a/src/VDF/Champs/Champ_front_debit_QC.cpp b/src/VDF/Champs/Champ_front_debit_QC.cpp index 080f4a3b21..f9b11b3092 100644 --- a/src/VDF/Champs/Champ_front_debit_QC.cpp +++ b/src/VDF/Champs/Champ_front_debit_QC.cpp @@ -129,21 +129,24 @@ void Champ_front_debit_QC::mettre_a_jour(double tps) int nfin = ndeb + nb_faces; const DoubleTab& tab_rhonp1P0 =fluide->loi_etat()->rho_np1(); if (ismoyen==0) + { + ToDo_Kokkos("critical"); + for (int num_face=ndeb; num_faceloi_etat()->rho_np1(); if (ismoyen==0) + { + ToDo_Kokkos("critical"); + for (int num_face=ndeb; num_facenb_comp()); mon_champ_Vec_->valeur_aux_centres_de_gravite(dom, valeurs_Vec); + ToDo_Kokkos("critical"); for(int elem=0; elempas_de_temps(); + ToDo_Kokkos("critical"); for (int face = 0; face < nb_faces; face++) { // Calcul de la taille de maille entourant la face diff --git a/src/VDF/Champs/Reynolds_maille_Champ_Face.cpp b/src/VDF/Champs/Reynolds_maille_Champ_Face.cpp index 3a18dca308..3b9c517e3a 100644 --- a/src/VDF/Champs/Reynolds_maille_Champ_Face.cpp +++ b/src/VDF/Champs/Reynolds_maille_Champ_Face.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -47,6 +47,7 @@ void Reynolds_maille_Champ_Face::mettre_a_jour(double tps) { const int nb_faces = domaine_vdf().nb_faces(); DoubleTab& re = valeurs(); // Reynolds de maille + ToDo_Kokkos("critical"); for (int face = 0; face < nb_faces; face++) { // Calcul de la viscosite face diff --git a/src/VDF/Champs/T_paroi_Champ_P0_VDF.cpp b/src/VDF/Champs/T_paroi_Champ_P0_VDF.cpp index 3f08952437..79c567467f 100644 --- a/src/VDF/Champs/T_paroi_Champ_P0_VDF.cpp +++ b/src/VDF/Champs/T_paroi_Champ_P0_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -79,6 +79,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps) if (sub_type(Periodique, la_cl.valeur())) { + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); @@ -92,6 +93,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps) } else if (sub_type(Dirichlet, la_cl.valeur())) { + ToDo_Kokkos("critical"); for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) { int elem = face_voisins(num_face, 0); @@ -105,6 +107,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps) } else if (sub_type(Dirichlet_homogene, la_cl.valeur())) { + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { int elem = face_voisins(num_face, 0); @@ -117,6 +120,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps) } else if (sub_type(Neumann_homogene, la_cl.valeur()) || sub_type(Navier, la_cl.valeur())) // grad nulle { + ToDo_Kokkos("critical"); for (int num_face = ndeb; 
num_face < nfin; num_face++) { int elem = face_voisins(num_face, 0); @@ -130,6 +134,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps) else if (sub_type(Echange_externe_impose, la_cl.valeur())) { const Echange_externe_impose& la_cl_ext = ref_cast(Echange_externe_impose, la_cl.valeur()); + ToDo_Kokkos("critical"); for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) { const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); @@ -160,6 +165,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps) else if (sub_type(Echange_global_impose, la_cl.valeur())) { const Echange_global_impose& la_cl_glob = ref_cast(Echange_global_impose, la_cl.valeur()); + ToDo_Kokkos("critical"); for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) { const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); @@ -175,20 +181,24 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps) } } else if (sub_type(Neumann_paroi, la_cl.valeur())) - for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) - { - const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); - const double signe = elem1 > -1 ? -1.0 : 1.0; - const int elem = elem1 > -1 ? elem1 : elem2; - const double e = Objet_U::axi ? dvdf.dist_norm_bord_axi(num_face) : dvdf.dist_norm_bord(num_face); - const double nu = eval.nu_2_impl(elem, k), t_elem = temp(elem, k); - - val(elem, k) += signe * e * ref_cast(Neumann_paroi, la_cl.valeur()).flux_impose(num_face_cl, k) / nu + t_elem; - indx_pond(elem, k)++; - } + { + ToDo_Kokkos("critical"); + for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) + { + const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); + const double signe = elem1 > -1 ? -1.0 : 1.0; + const int elem = elem1 > -1 ? elem1 : elem2; + const double e = Objet_U::axi ? 
dvdf.dist_norm_bord_axi(num_face) : dvdf.dist_norm_bord(num_face); + const double nu = eval.nu_2_impl(elem, k), t_elem = temp(elem, k); + + val(elem, k) += signe * e * ref_cast(Neumann_paroi, la_cl.valeur()).flux_impose(num_face_cl, k) / nu + t_elem; + indx_pond(elem, k)++; + } + } } // On moyenne la contribution + ToDo_Kokkos("critical"); for (int elem = 0; elem < n_elem; elem++) for (int k = 0; k < N; k++) // pour multiphase if (indx_pond(elem, k) > 0) diff --git a/src/VDF/Champs/Taux_cisaillement_P0_VDF.cpp b/src/VDF/Champs/Taux_cisaillement_P0_VDF.cpp index a981348b27..f295f664cf 100644 --- a/src/VDF/Champs/Taux_cisaillement_P0_VDF.cpp +++ b/src/VDF/Champs/Taux_cisaillement_P0_VDF.cpp @@ -17,6 +17,8 @@ #include #include #include +#include +#include Implemente_instanciable(Taux_cisaillement_P0_VDF, "Taux_cisaillement_P0_VDF", Champ_Fonc_P0_VDF); @@ -35,13 +37,18 @@ void Taux_cisaillement_P0_VDF::mettre_a_jour(double tps) int nb_elem = le_dom_VF->nb_elem(); int N = vitesse_->valeurs().line_size(); - DoubleTab tmp(nb_elem,N); - vitesse_->calcul_S_barre_Multiphase(vitesse_->valeurs(), tmp, le_dom_Cl_VDF.valeur()); - - DoubleTab& S = valeurs(); // Shear rate - for (int n = 0; n < N; n++) - for (int i = 0; i < nb_elem; i++) - S(i,n) = sqrt(tmp(i,n)); + DoubleTrav tab_S_barre(nb_elem,N); + vitesse_->calcul_S_barre_Multiphase(vitesse_->valeurs(), tab_S_barre, le_dom_Cl_VDF.valeur()); + + // Convert to Kokkos parallel_for + CDoubleTabView S_barre = tab_S_barre.view_ro(); + DoubleTabView S = valeurs().view_wo(); // Shear rate + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int i) + { + for (int n = 0; n < N; n++) + S(i,n) = Kokkos::sqrt(S_barre(i,n)); + }); + end_gpu_timer(__KERNEL_NAME__); changer_temps(tps); Champ_Fonc_base::mettre_a_jour(tps); diff --git a/src/VDF/Cond_Lim/Echange_contact_Correlation_VDF.cpp b/src/VDF/Cond_Lim/Echange_contact_Correlation_VDF.cpp index 128073a5e4..0c5a30ee9b 100644 --- 
a/src/VDF/Cond_Lim/Echange_contact_Correlation_VDF.cpp +++ b/src/VDF/Cond_Lim/Echange_contact_Correlation_VDF.cpp @@ -670,6 +670,7 @@ void Echange_contact_Correlation_VDF::calculer_h_solide(DoubleTab& tab,const Equ e.resize(front_vf.nb_faces()); + ToDo_Kokkos("critical"); for (int face=ndeb; facevaleur_au_bord(face); } diff --git a/src/VDF/Cond_Lim/Sortie_libre_Pression_imposee_Orlansky.cpp b/src/VDF/Cond_Lim/Sortie_libre_Pression_imposee_Orlansky.cpp index 129238203a..4cece14a46 100644 --- a/src/VDF/Cond_Lim/Sortie_libre_Pression_imposee_Orlansky.cpp +++ b/src/VDF/Cond_Lim/Sortie_libre_Pression_imposee_Orlansky.cpp @@ -104,6 +104,7 @@ void Sortie_libre_Pression_imposee_Orlansky::mettre_a_jour(double temps) int face, compo; + ToDo_Kokkos("critical"); for (face = ndeb; face < ndeb + nb_faces_loc; face++) { int i = face - ndeb; @@ -154,53 +155,59 @@ void Sortie_libre_Pression_imposee_Orlansky::mettre_a_jour(double temps) //Debog::verifier_bord("Orlansky::mettre_a_jour() : vitesse_moins_deux : " , vitesse_moins_deux, ndeb); //Debog::verifier_bord("Orlansky::mettre_a_jour() : vitesse_moins_un : " , vitesse_moins_un, ndeb); + ToDo_Kokkos("critical"); for (compo = 0; compo < dimension; compo++) - for (face = ndeb; face < ndeb + nb_faces_loc; face++) - { - int i = face - ndeb; - - int ori = zvdf.orientation(face); - - vitesse_temps_moins_deux(i, compo) = vitesse_temps_moins_un(i, compo); - vitesse_temps_moins_un(i, compo) = vit_ext(i, compo); - vitesse_moins_un_temps_moins_deux(i, compo) = vitesse_moins_un_temps_moins_un(i, compo); - - vitesse_moins_un_temps_moins_un(i, compo) = vitesse_moins_un(i, compo); - vitesse_moins_deux_temps_moins_un(i, compo) = vitesse_moins_deux(i, compo); - - int elem_un = zvdf.face_voisins(face, 0); - if (elem_un < 0) - elem_un = zvdf.face_voisins(face, 1); - int face_moins_un = zvdf.elem_faces(elem_un, ori); - if (face_moins_un == face) - face_moins_un = zvdf.elem_faces(elem_un, ori + dimension); - double vit = 0.5 * 
(vitesse(zvdf.elem_faces(elem_un, compo)) + vitesse(zvdf.elem_faces(elem_un, compo + dimension))); - - vitesse_moins_un(i, compo) = vit; - - int elem_deux = zvdf.face_voisins(face_moins_un, 0); - if (elem_deux == elem_un) - elem_deux = zvdf.face_voisins(face_moins_un, 1); - vit = 0.5 * (vitesse(zvdf.elem_faces(elem_deux, compo)) + vitesse(zvdf.elem_faces(elem_deux, compo + dimension))); - vitesse_moins_deux(i, compo) = vit; - - double pre_m_un_t_m_deux = vitesse_moins_un_temps_moins_deux(i, compo); - double pre_m_deux_t_m_un = vitesse_moins_deux_temps_moins_un(i, compo); - double pre_m_un = vitesse_moins_un(i, compo); - - if (pre_m_un_t_m_deux == pre_m_un) - VPhiV(i, compo) = 0; - else - VPhiV(i, compo) = (pre_m_un_t_m_deux - pre_m_un) / (pre_m_un + pre_m_un_t_m_deux - 2 * pre_m_deux_t_m_un); - if (VPhiV(i, compo) <= 1.e-24) - VPhiV(i, compo) = 0.0; - if (VPhiV(i, compo) > 1.) - VPhiV(i, compo) = 1.0; - assert(VPhiV(i, compo) < 1.e12); - - vit_ext(i, compo) = (1 - VPhiV(i, compo)) / (1 + VPhiV(i, compo)) * vitesse_temps_moins_un(i, compo) + (2 * VPhiV(i, compo) / (1 + VPhiV(i, compo))) * vitesse_moins_un(i, compo); - - } + { + for (face = ndeb; face < ndeb + nb_faces_loc; face++) + { + int i = face - ndeb; + + int ori = zvdf.orientation(face); + + vitesse_temps_moins_deux(i, compo) = vitesse_temps_moins_un(i, compo); + vitesse_temps_moins_un(i, compo) = vit_ext(i, compo); + vitesse_moins_un_temps_moins_deux(i, compo) = vitesse_moins_un_temps_moins_un(i, compo); + + vitesse_moins_un_temps_moins_un(i, compo) = vitesse_moins_un(i, compo); + vitesse_moins_deux_temps_moins_un(i, compo) = vitesse_moins_deux(i, compo); + + int elem_un = zvdf.face_voisins(face, 0); + if (elem_un < 0) + elem_un = zvdf.face_voisins(face, 1); + int face_moins_un = zvdf.elem_faces(elem_un, ori); + if (face_moins_un == face) + face_moins_un = zvdf.elem_faces(elem_un, ori + dimension); + double vit = 0.5 * (vitesse(zvdf.elem_faces(elem_un, compo)) + + vitesse(zvdf.elem_faces(elem_un, compo + 
dimension))); + + vitesse_moins_un(i, compo) = vit; + + int elem_deux = zvdf.face_voisins(face_moins_un, 0); + if (elem_deux == elem_un) + elem_deux = zvdf.face_voisins(face_moins_un, 1); + vit = 0.5 * + (vitesse(zvdf.elem_faces(elem_deux, compo)) + vitesse(zvdf.elem_faces(elem_deux, compo + dimension))); + vitesse_moins_deux(i, compo) = vit; + + double pre_m_un_t_m_deux = vitesse_moins_un_temps_moins_deux(i, compo); + double pre_m_deux_t_m_un = vitesse_moins_deux_temps_moins_un(i, compo); + double pre_m_un = vitesse_moins_un(i, compo); + + if (pre_m_un_t_m_deux == pre_m_un) + VPhiV(i, compo) = 0; + else + VPhiV(i, compo) = (pre_m_un_t_m_deux - pre_m_un) / (pre_m_un + pre_m_un_t_m_deux - 2 * pre_m_deux_t_m_un); + if (VPhiV(i, compo) <= 1.e-24) + VPhiV(i, compo) = 0.0; + if (VPhiV(i, compo) > 1.) + VPhiV(i, compo) = 1.0; + assert(VPhiV(i, compo) < 1.e12); + + vit_ext(i, compo) = (1 - VPhiV(i, compo)) / (1 + VPhiV(i, compo)) * vitesse_temps_moins_un(i, compo) + + (2 * VPhiV(i, compo) / (1 + VPhiV(i, compo))) * vitesse_moins_un(i, compo); + + } + } Debog::verifier_bord("Orlansky::mettre_a_jour() : vit_ext : ", vit_ext, ndeb); Debog::verifier_bord("Orlansky::mettre_a_jour() : VPhiV : ", VPhiV, ndeb); } diff --git a/src/VDF/Geometrie/Domaine_Cl_VDF.cpp b/src/VDF/Geometrie/Domaine_Cl_VDF.cpp index 3f86323c61..769482a038 100644 --- a/src/VDF/Geometrie/Domaine_Cl_VDF.cpp +++ b/src/VDF/Geometrie/Domaine_Cl_VDF.cpp @@ -353,8 +353,6 @@ void Domaine_Cl_VDF::imposer_cond_lim(Champ_Inc_base& ch, double temps) { Champ_Face_VDF& ch_face = ref_cast(Champ_Face_VDF, ch); const Domaine_VDF& mon_dom_VDF = ch_face.domaine_vdf(); - int ndeb,nfin, num_face; - for(int i=0; i(ch_tab).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + int voisine = face_associee(num_face-ndeb) + ndeb; + if (tab(num_face) != tab(voisine)) + { + double moy = 0.5 * (tab(num_face) + tab(voisine)); + // Atomic operations to avoid race 
conditions when multiple threads + // try to update the same location (if two faces reference each other) + Kokkos::atomic_store(&tab(num_face), moy); + Kokkos::atomic_store(&tab(voisine), moy); + } + }); + end_gpu_timer(__KERNEL_NAME__); } } - else if( sub_type(Navier,la_cl) ) + else if( sub_type(Navier,la_cl) || sub_type(Dirichlet_paroi_fixe,la_cl) || sub_type(Dirichlet_paroi_defilante,la_cl) ) { const Front_VF& le_bord = ref_cast(Front_VF,la_cl.frontiere_dis()); - ndeb = le_bord.num_premiere_face(); - nfin = ndeb + le_bord.nb_faces(); - for (num_face=ndeb; num_face 0) + return dist; + else + return dist + l; + } + KOKKOS_INLINE_FUNCTION + double dist_face_elem1_period(int num_face,int n1,double l) const + { + int ori = orientation_[num_face]; + double dist = xp_(n1,ori) - xv_(num_face,ori); + if (dist > 0) + return dist; + else + return dist + l; + } +}; +#endif diff --git a/src/VDF/Geometrie/Faces_VDF.cpp b/src/VDF/Geometrie/Faces_VDF.cpp index 350aa0f3d3..17cd8ed6f8 100644 --- a/src/VDF/Geometrie/Faces_VDF.cpp +++ b/src/VDF/Geometrie/Faces_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -46,6 +46,7 @@ void Faces_VDF::calculer_orientation(IntVect& tab_orientation, const Domaine& dom=mondomaine; double dx=0, dy=0, dz=1.e30; + ToDo_Kokkos("critical"); for(int face=0; face 0) @@ -85,34 +56,13 @@ double norm_2D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF& void moy_3D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF& domaine, double& val1, double& val2) { - int num1, num2, num3, num4; - if (iori == 0) - { - num1 = domaine.elem_faces(elem, 1); - num2 = domaine.elem_faces(elem, 4); - num3 = domaine.elem_faces(elem, 2); - num4 = domaine.elem_faces(elem, 5); - } - else if (iori == 1) - { - num1 = domaine.elem_faces(elem, 0); - num2 = domaine.elem_faces(elem, 3); - num3 = domaine.elem_faces(elem, 2); - num4 = domaine.elem_faces(elem, 5); - } - else if (iori == 2) - { - num1 = domaine.elem_faces(elem, 0); - num2 = domaine.elem_faces(elem, 3); - num3 = domaine.elem_faces(elem, 1); - num4 = domaine.elem_faces(elem, 4); - } - else - { - Cerr << "valeur de iori " << iori << " impossible en 3D" << finl; - Process::exit(); - num1 = num2 = num3 = num4 = -1; - } + assert(iori==0 || iori==1 || iori==2); + int i = iori==0 ? 1 : 0; + int j = iori==2 ? 
0 : 1; + int num1 = domaine.elem_faces(elem, 0+i); + int num2 = domaine.elem_faces(elem, 3+i); + int num3 = domaine.elem_faces(elem, 1+j); + int num4 = domaine.elem_faces(elem, 4+j); val1 = 0.5 * (vit(num1) + vit(num2)); val2 = 0.5 * (vit(num3) + vit(num4)); } @@ -130,8 +80,8 @@ double norm_3D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF& double norm_3D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF& domaine, double u_paroi, double v_paroi, double w_paroi, double& val1, double& val2) { - double v1, v2, norm_vit; moy_3D_vit(vit, elem, iori, domaine, val1, val2); + double v1, v2; if (iori == 0) { v1 = val1 - v_paroi; // EB 28/08/25 : for a wall of normal x, val1 is the velocity in y direction and val2 in z direction @@ -148,14 +98,13 @@ double norm_3D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF& v2 = val2 - v_paroi; } //Fin modif YB - else { Cerr << "valeur de iori " << iori << " impossible en 3D" << finl; Process::exit(); v1 = v2 = 0; } - norm_vit = sqrt(v1 * v1 + v2 * v2); + double norm_vit = sqrt(v1 * v1 + v2 * v2); val1 = v1 / (norm_vit + DMINFLOAT); val2 = v2 / (norm_vit + DMINFLOAT); return norm_vit; @@ -170,14 +119,14 @@ double norm_vit(const DoubleVect& vit, int elem, int ori, const Domaine_VDF& dom } -void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& rot) +KOKKOS_INLINE_FUNCTION +void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot) { - const IntTab& elem_faces = domaine_VDF.elem_faces(); double delta_x_0, delta_x_1, delta_y_0, delta_y_1; double delta_x, delta_y; double deriv_vx, deriv_uy; - int N = val.line_size(), n; + int N = (int)val.extent(1), n; deriv_vx = 0; deriv_uy = 0; @@ -189,7 +138,6 @@ void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, cons delta_x = 
(delta_x_1 - delta_x_0) * (delta_x_1 + delta_x_0) / (delta_x_1 * delta_x_0); delta_y = (delta_y_1 - delta_y_0) * (delta_y_1 + delta_y_0) / (delta_y_1 * delta_y_0); - for (n=0; n 0 ? 1 : -1); + return n_v; +} + +KOKKOS_INLINE_FUNCTION void moy_3D_vit(CDoubleArrView vit, int elem, int iori, CIntTabView elem_faces, double& val1, double& val2) +{ + assert(iori==0 || iori==1 || iori==2); + int i = iori==0 ? 1 : 0; + int j = iori==2 ? 0 : 1; + val1 = 0.5 * (vit(elem_faces(elem, i)) + vit(elem_faces(elem, 3+i))); + val2 = 0.5 * (vit(elem_faces(elem, 1+j)) + vit(elem_faces(elem, 4+j))); +} + +KOKKOS_INLINE_FUNCTION double norm_3D_vit(CDoubleArrView vit, int elem, int iori, CIntTabView elem_faces, double& val1, double& val2) +{ + moy_3D_vit(vit, elem, iori, elem_faces, val1, val2); + double v1 = Kokkos::fabs(val1), v2 = Kokkos::fabs(val2); + double norm_vit = Kokkos::sqrt(v1 * v1 + v2 * v2); + val1 = v1 / (norm_vit + DMINFLOAT); + val2 = v2 / (norm_vit + DMINFLOAT); + return norm_vit; +} + +KOKKOS_INLINE_FUNCTION double norm_3D_vit(CDoubleArrView vit, int elem, int iori, CIntTabView elem_faces, double u_paroi, double v_paroi, double w_paroi, double& val1, double& val2) +{ + moy_3D_vit(vit, elem, iori, elem_faces, val1, val2); + double v1 = val1 - (iori==0 ? v_paroi : u_paroi); + double v2 = val2 - (iori==2 ? 
v_paroi : w_paroi); + double norm_vit = Kokkos::sqrt(v1 * v1 + v2 * v2); + val1 = v1 / (norm_vit + DMINFLOAT); + val2 = v2 / (norm_vit + DMINFLOAT); + return norm_vit; +} + +KOKKOS_INLINE_FUNCTION void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot); +KOKKOS_INLINE_FUNCTION void calcul_bord2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot); +void calrotord2centelemdim2(DoubleTab& rot, const DoubleTab& val, const Domaine_VDF& domaine_VDF); +KOKKOS_INLINE_FUNCTION void calcul_interne3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot); +KOKKOS_INLINE_FUNCTION void calcul_bord3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot); +void calrotord2centelemdim3(DoubleTab& rot, const DoubleTab& val, const Domaine_VDF& domaine_VDF); // Calcul du produit scalaire du tenseur des vitesses de deformation en coordonnees cartesiennes : calcul 2D puis 3D. void calcul_dscald_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& dscald); diff --git a/src/VDF/Milieu/EDO_Pression_th_VDF.cpp b/src/VDF/Milieu/EDO_Pression_th_VDF.cpp index de532824e3..2fa2164d2c 100644 --- a/src/VDF/Milieu/EDO_Pression_th_VDF.cpp +++ b/src/VDF/Milieu/EDO_Pression_th_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -17,6 +17,8 @@ #include #include #include +#include +#include Implemente_base(EDO_Pression_th_VDF, "EDO_Pression_th_VDF", EDO_Pression_th_base); @@ -90,27 +92,27 @@ double EDO_Pression_th_VDF::masse_totale(const DoubleTab& P, const DoubleTab& T) return M; } -void EDO_Pression_th_VDF::calculer_grad(const DoubleTab& inco, DoubleTab& resu) +void EDO_Pression_th_VDF::calculer_grad(const DoubleTab& tab_inco, DoubleTab& tab_resu) { - int face, n0, n1, ori; - double coef; const Domaine_VDF& dom = ref_cast(Domaine_VDF, le_dom.valeur()); - const IntTab& face_voisins = dom.face_voisins(); - const IntVect& orientation = dom.orientation(); - const DoubleVect& porosite_surf = le_fluide_->porosite_face(); - const DoubleTab& xp = dom.xp(); - const DoubleVect& volume_entrelaces = le_dom->volumes_entrelaces(); + CIntTabView face_voisins = dom.face_voisins().view_ro(); + CIntArrView orientation = dom.orientation().view_ro(); + CDoubleArrView porosite_surf = le_fluide_->porosite_face().view_ro(); + CDoubleTabView xp = dom.xp().view_ro(); + CDoubleArrView volume_entrelaces = le_dom->volumes_entrelaces().view_ro(); + CDoubleArrView inco = static_cast(tab_inco).view_ro(); + DoubleArrView resu = static_cast(tab_resu).view_rw(); // Boucle sur les faces internes - ToDo_Kokkos("critical"); - for (face = dom.premiere_face_int(); face < dom.nb_faces(); face++) - { - n0 = face_voisins(face, 0); - n1 = face_voisins(face, 1); - ori = orientation(face); - coef = volume_entrelaces(face) * porosite_surf(face); - coef = 1; - resu(face) += coef * (inco(n1) - inco(n0)) / (xp(n1, ori) - xp(n0, ori)); - } + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(dom.premiere_face_int(), dom.nb_faces()), KOKKOS_LAMBDA(const int face) + { + const int n0 = face_voisins(face, 0); + const int n1 = face_voisins(face, 1); + const int ori = 
orientation(face); + double coef = volume_entrelaces(face) * porosite_surf(face); + coef = 1; + resu(face) += coef * (inco(n1) - inco(n0)) / (xp(n1, ori) - xp(n0, ori)); + }); + end_gpu_timer(__KERNEL_NAME__); } diff --git a/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Parfait.cpp b/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Parfait.cpp index 13bd387699..4eff72d8ce 100644 --- a/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Parfait.cpp +++ b/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Parfait.cpp @@ -65,6 +65,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n) const DoubleTab& tempn = le_fluide_->inco_chaleur().passe(); double cn1 = 0, cn = 0, v; int elem, nb_elem = le_dom->nb_elem(); + ToDo_Kokkos("critical"); for (elem = 0; elem < nb_elem; elem++) { v = le_dom->volumes(elem); @@ -87,6 +88,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n) const Front_VF& la_front_dis = ref_cast(Front_VF, la_cl.frontiere_dis()); int ndeb = la_front_dis.num_premiere_face(); int nfin = ndeb + la_front_dis.nb_faces(); + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { int n0 = face_voisins(num_face, 0); @@ -148,6 +150,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n) ref_cast(Navier_Stokes_std,le_fluide_->vitesse().equation()).operateur_divergence().calculer(tab_vit, divU); DoubleTrav gradT(tab_vit.dimension(0)); DoubleTrav Tstar(tab_vit.dimension(0)); + ToDo_Kokkos("critical"); for (elem = 0; elem < nb_elem; elem++) { Tstar(elem) = .5 * (tempn(elem) + tempnp1(elem)); @@ -155,6 +158,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n) calculer_grad(Tstar, gradT); DoubleTab u_gradT(nb_elem); int f1, f2, i; + ToDo_Kokkos("critical"); for (elem = 0; elem < nb_elem; elem++) { u_gradT(elem) = 0; @@ -167,6 +171,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n) } } + ToDo_Kokkos("critical"); for (elem = 0; elem < nb_elem; elem++) { v = le_dom->volumes(elem); @@ -186,6 +191,7 @@ double 
EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n) ndeb = frontiere_dis.num_premiere_face(); nfin = ndeb + frontiere_dis.nb_faces(); //if (sub_type(Neumann_sortie_libre, la_cl.valeur()) || sub_type(Dirichlet_entree_fluide, la_cl.valeur())) { + ToDo_Kokkos("critical"); for (face = ndeb; face < nfin; face++) { elem = le_dom->face_voisins(face, 0); @@ -236,6 +242,7 @@ void EDO_Pression_th_VDF_Gaz_Parfait::resoudre(DoubleTab& Pth_n) double cn1 = 0., cn = 0., v = -123.; + ToDo_Kokkos("critical"); for (int elem = 0; elem < le_dom->nb_elem(); elem++) { v = le_dom->volumes(elem); @@ -257,6 +264,7 @@ void EDO_Pression_th_VDF_Gaz_Parfait::resoudre(DoubleTab& Pth_n) const Front_VF& la_front_dis = ref_cast(Front_VF, la_cl.frontiere_dis()); int ndeb = la_front_dis.num_premiere_face(); int nfin = ndeb + la_front_dis.nb_faces(); + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { int n0 = face_voisins(num_face, 0); @@ -281,6 +289,7 @@ void EDO_Pression_th_VDF_Gaz_Parfait::resoudre(DoubleTab& Pth_n) double cnt = cn, cn1t = cn1, cmt = cm; mp_sum_for_each(cnt, cn1t, cmt); + ToDo_Kokkos("critical"); for (int elem = 0; elem < le_dom->nb_elem(); elem++) Pth_n(elem) = Pth_n(elem) * cnt / cn1t / (1. + dt / cn1t * cmt); } diff --git a/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Reel.cpp b/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Reel.cpp index 433dda371f..d3cf4c7703 100644 --- a/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Reel.cpp +++ b/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Reel.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -66,6 +66,7 @@ double EDO_Pression_th_VDF_Gaz_Reel::resoudre(double Pth_n) ref_cast(Navier_Stokes_std,le_fluide_->vitesse().equation()).operateur_divergence().calculer(tab_vit, divU); DoubleTrav gradh(tab_vit.dimension(0)); DoubleTrav Hstar(tab_vit.dimension(0)); + ToDo_Kokkos("critical"); for (elem = 0; elem < nb_elem; elem++) { Hstar(elem) = .5 * (tab_hn(elem) + tab_hnp1(elem)); @@ -73,6 +74,7 @@ double EDO_Pression_th_VDF_Gaz_Reel::resoudre(double Pth_n) calculer_grad(Hstar, gradh); DoubleTab u_gradh(nb_elem); int f1, f2; + ToDo_Kokkos("critical"); for (elem = 0; elem < nb_elem; elem++) { u_gradh(elem) = 0; @@ -84,6 +86,7 @@ double EDO_Pression_th_VDF_Gaz_Reel::resoudre(double Pth_n) } } + ToDo_Kokkos("critical"); for (elem = 0; elem < nb_elem; elem++) { v = dom.volumes(elem); @@ -111,6 +114,7 @@ double EDO_Pression_th_VDF_Gaz_Reel::resoudre(double Pth_n) { tmp = Pth; Fnp1 = 0; + ToDo_Kokkos("critical"); for (elem = 0; elem < nb_elem; elem++) { v = dom.volumes(elem); diff --git a/src/VDF/Milieu/EOS_Tools_VDF.cpp b/src/VDF/Milieu/EOS_Tools_VDF.cpp index fefee85d96..2677ef1c15 100644 --- a/src/VDF/Milieu/EOS_Tools_VDF.cpp +++ b/src/VDF/Milieu/EOS_Tools_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -37,7 +37,7 @@ Entree& EOS_Tools_VDF::readOn(Entree& is) return is; } -void EOS_Tools_VDF::associer_domaines(const Domaine_dis_base& dds, const Domaine_Cl_dis_base& domaine_cl) +void EOS_Tools_VDF::associer_domaines(const Domaine_dis_base& dds, const Domaine_Cl_dis_base& domaine_cl) { le_dom = ref_cast(Domaine_VDF,dds); le_dom_Cl = domaine_cl; @@ -57,46 +57,58 @@ void EOS_Tools_VDF::associer_domaines(const Domaine_dis_base& dds, const Domain double EOS_Tools_VDF::moyenne_vol(const DoubleTab& tab) const { int nb_elem=le_dom->nb_elem(); - const DoubleVect& volumes = le_dom->volumes(); assert(tab.dimension(0)==nb_elem); - ArrOfDouble sum(2); - sum = 0; - for (int elem=0 ; elemvolumes().view_ro(); + CDoubleArrView val = static_cast(tab).view_ro(); + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem, double & sum_tmp, double & volume_tmp) + { + double v = volumes(elem); + volume_tmp += v; + sum_tmp += v * val(elem); + }, sum, volume); + end_gpu_timer(__KERNEL_NAME__); + + DoubleTrav pair(2); + pair[0] = volume; + pair[1] = sum; + mp_sum_for_each_item(pair); + return pair[1]/pair[0]; } void EOS_Tools_VDF::calculer_rho_face_np1(const DoubleTab& tab_rhoP0) { - int face, elem, nb_faces_tot = le_dom->nb_faces_tot(); + int nb_faces_tot = le_dom->nb_faces_tot(); Debog::verifier("tab_rhoP0",tab_rhoP0); - int i, nb_comp; - IntTab& face_voisins = le_dom->face_voisins(); - for (face=0 ; faceface_voisins().view_ro(); + CDoubleArrView rhoP0 = static_cast(tab_rhoP0).view_ro(); + CDoubleArrView rho_face = static_cast(tab_rho_face).view_ro(); + DoubleArrView rho_face_np1 = static_cast(tab_rho_face_np1).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_tot, KOKKOS_LAMBDA(const int face) + { + int nb_comp=0; + rho_face_np1(face) = 0; + for (int i=0 ; i<2 ; i++) + { + 
const int elem = face_voisins(face,i); + if (elem!=-1) + { + nb_comp++; + rho_face_np1(face) += rhoP0(elem); + } + } + rho_face_np1(face) /= nb_comp; + }); + end_gpu_timer(__KERNEL_NAME__); tab_rho_face_np1.echange_espace_virtuel(); Debog::verifier("tab_rho_face_np1",tab_rho_face_np1); - for (face=0 ; face(tab_rho_face_demi).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_tot, KOKKOS_LAMBDA(const int face) + { + rho_face_demi(face)=0.5*(rho_face_np1(face)+rho_face(face)); + }); + end_gpu_timer(__KERNEL_NAME__); } /*! @brief Renvoie rho avec la meme discretisation que la vitesse : une valeur par face en VDF @@ -114,6 +126,7 @@ const DoubleTab& EOS_Tools_VDF::rho_discvit() const */ void EOS_Tools_VDF::divu_discvit(const DoubleTab& secmem1, DoubleTab& secmem2) { + ToDo_Kokkos("VDF critical but not tested..."); assert_espace_virtuel_vect(secmem1); int nb_faces_tot = le_dom->nb_faces_tot(); IntTab& face_voisins = le_dom->face_voisins(); @@ -143,17 +156,21 @@ void EOS_Tools_VDF::divu_discvit(const DoubleTab& secmem1, DoubleTab& secmem2) void EOS_Tools_VDF::secmembre_divU_Z(DoubleTab& tab_W) const { double dt = le_fluide().vitesse().equation().schema_temps().pas_de_temps(); - int elem,nb_elem = le_dom->nb_elem();//,nb_faces = le_dom->nb_faces(); - DoubleVect tab_dZ(nb_elem); - //DoubleTab tab_gradZ(nb_faces); + int nb_elem = le_dom->nb_elem(); + DoubleTrav tab_dZ(nb_elem); const DoubleTab& tab_rhonP0 = le_fluide().loi_etat()->rho_n(); const DoubleTab& tab_rhonp1P0 = le_fluide().loi_etat()->rho_np1(); Debog::verifier("divU tab_rhonP0",tab_rhonP0); Debog::verifier("divU tab_rhonp1P0",tab_rhonp1P0); - const DoubleVect& volumes = le_dom->volumes(); - for (elem=0 ; elem(tab_rhonP0).view_ro(); + CDoubleArrView rhonp1P0 = static_cast(tab_rhonp1P0).view_ro(); + DoubleArrView dZ = static_cast(tab_dZ).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem) + { + dZ(elem) = 
(rhonp1P0(elem)-rhonP0(elem))/dt; + }); + end_gpu_timer(__KERNEL_NAME__); // Ajout des termes sources speciaux de l'equation de masse: const bool has_mass_flux = (sub_type(Navier_Stokes_Fluide_Dilatable_base, le_fluide().vitesse().equation())) ? @@ -165,16 +182,26 @@ void EOS_Tools_VDF::secmembre_divU_Z(DoubleTab& tab_W) const src_mass.ajouter_projection(le_fluide(), static_cast(tab_dZ)); } - for (elem = 0; elem < nb_elem; elem++) - tab_W(elem) = -tab_dZ(elem) * volumes(elem); + CDoubleArrView volumes = le_dom->volumes().view_ro(); + CDoubleArrView dZ2 = static_cast(tab_dZ).view_ro(); // dZ sync cause tab_dZ may be host updated into src_mass.ajouter_projection !!!! + DoubleArrView W = static_cast(tab_W).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem) + { + W(elem) = -dZ2(elem) * volumes(elem); + }); + end_gpu_timer(__KERNEL_NAME__); } void EOS_Tools_VDF::mettre_a_jour(double temps) { int n=tab_rho_face_np1.size_totale(); - for (int i=0; i(tab_rho_face_np1).view_ro(); + DoubleArrView rho_face = static_cast(tab_rho_face).view_wo(); + DoubleArrView rho_face_demi = static_cast(tab_rho_face_demi).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA(const int i) + { + rho_face(i) = rho_face_np1(i); + rho_face_demi(i) = rho_face_np1(i); + }); + end_gpu_timer(__KERNEL_NAME__); } diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF.h index d85514240b..018b44d3ad 100644 --- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF.h +++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -33,7 +33,11 @@ class Eval_Conv_VDF : public Evaluateur_VDF, public Eval_Conv_VDF_tools public: inline Eval_Conv_VDF() { } - inline Eval_Conv_VDF(const Eval_Conv_VDF& eval) : Evaluateur_VDF(eval), vitesse_(eval.vitesse_) { dt_vitesse.ref(eval.dt_vitesse); } + inline Eval_Conv_VDF(const Eval_Conv_VDF& eval) : Evaluateur_VDF(eval), vitesse_(eval.vitesse_) + { + dt_vitesse.ref(eval.dt_vitesse); + dt_vitesse_v_ = eval.dt_vitesse_v_; + } inline void associer(const Champ_Face_VDF& ); inline void mettre_a_jour( ) { dt_vitesse.ref(vitesse_->valeurs()); } @@ -41,19 +45,52 @@ class Eval_Conv_VDF : public Evaluateur_VDF, public Eval_Conv_VDF_tools inline Champ_Inc_base& vitesse() { return vitesse_.valeur(); } // pour CRTP - inline int get_elem(int i, int j) const { return elem_(i,j); } - inline int get_orientation(int i ) const { return orientation(i); } - inline int get_premiere_face_bord() const { return premiere_face_bord; } - inline double get_dt_vitesse(int face, int comp = 0) const { return dt_vitesse(face, comp); } - inline double get_surface_porosite(int face) const { return surface(face)*porosite(face); } - inline double get_surface(int face) const { return surface(face); } - inline double get_porosite(int face) const { return porosite(face); } inline const DoubleTab& get_tab_vitesse() const { return dt_vitesse; } - inline const Domaine_Cl_VDF& get_la_zcl() const { return la_zcl.valeur(); } + + // For views: + template + KOKKOS_INLINE_FUNCTION int get_elem(int i, int j) const { if constexpr (std::is_same::value) return elem_(i,j); else return elem_v_(i,j); } + template + KOKKOS_INLINE_FUNCTION int get_orientation(int i ) const { if constexpr (std::is_same::value) return orientation(i); else return orientation_v_(i); } + template + KOKKOS_INLINE_FUNCTION double get_surface(int face) const { if constexpr 
(std::is_same::value) return surface(face); else return surface_v_(face); } + template + KOKKOS_INLINE_FUNCTION int get_dt_vitesse_nb_comp() const { if constexpr (std::is_same::value) return dt_vitesse.line_size(); else return (int)dt_vitesse_v_.extent(1); } + template + KOKKOS_INLINE_FUNCTION double get_dt_vitesse(int face, int comp = 0) const { if constexpr (std::is_same::value) return dt_vitesse(face, comp); else return dt_vitesse_v_(face, comp); } + template + KOKKOS_INLINE_FUNCTION double get_surface_porosite(int face) const { if constexpr (std::is_same::value) return surface(face)*porosite(face); else return surface_v_(face)*porosite_v_(face); } + template + KOKKOS_INLINE_FUNCTION double get_porosite(int face) const { if constexpr (std::is_same::value) return porosite(face); else return porosite_v_(face); } + template + KOKKOS_INLINE_FUNCTION double get_dist_face(int n1,int n2,int k) const { if constexpr (std::is_same::value) return le_dom->dist_face(n1,n2,k); else return le_dom_v_.dist_face(n1,n2,k); } + template + KOKKOS_INLINE_FUNCTION double get_dist_face_period(int n1,int n2,int k) const { if constexpr (std::is_same::value) return le_dom->dist_face_period(n1,n2,k); else return le_dom_v_.dist_face_period(n1,n2,k); } + template + KOKKOS_INLINE_FUNCTION int get_amont_amont(int face, int i) const { if constexpr (std::is_same::value) return le_dom->amont_amont(face,i); else return le_dom_v_.amont_amont(face,i); } + template + KOKKOS_INLINE_FUNCTION int get_face_amont_princ(int num_face, int i) const { if constexpr (std::is_same::value) return le_dom->face_amont_princ(num_face,i); else return le_dom_v_.face_amont_princ(num_face,i); } + template + KOKKOS_FORCEINLINE_FUNCTION int get_face_amont_conj(int num_face,int i, int k) const { if constexpr (std::is_same::value) return le_dom->face_amont_conj(num_face,i,k); else return le_dom_v_.face_amont_conj(num_face,i,k); } + template + KOKKOS_INLINE_FUNCTION double get_dim_face(int n1,int k) const { if constexpr 
(std::is_same::value) return le_dom->dim_face(n1,k); else return le_dom_v_.dim_face(n1,k); } + template + KOKKOS_INLINE_FUNCTION double get_dim_elem(int n1, int k) const { if constexpr (std::is_same::value) return le_dom->dim_elem(n1, k); else return le_dom_v_.dim_elem(n1, k); } + template + KOKKOS_INLINE_FUNCTION double get_dist_elem(int n1, int n2, int k) const { if constexpr (std::is_same::value) return le_dom->dist_elem(n1, n2, k); else return le_dom_v_.dist_elem(n1, n2, k); } + template + KOKKOS_INLINE_FUNCTION double get_dist_elem_period(int n1, int n2, int k) const { if constexpr (std::is_same::value) return le_dom->dist_elem_period(n1, n2, k); else return le_dom_v_.dist_elem_period(n1, n2, k); } + + void view_ro() const override + { + Evaluateur_VDF::view_ro(); + dt_vitesse_v_ = dt_vitesse.view_ro(); + } protected: OBS_PTR(Champ_Face_VDF) vitesse_; DoubleTab dt_vitesse; + mutable CDoubleTabView dt_vitesse_v_; }; /*! @brief associe le champ de vitesse transportante diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.h index 493e3db25f..8a5014619e 100644 --- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.h +++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.h @@ -38,9 +38,11 @@ class Eval_Conv_VDF_Elem : public Eval_VDF_Elem // To overload template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Dirichlet_entree_fluide&, const int, Type_Double& ) const; template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Neumann_sortie_libre&, const int, Type_Double& ) const; - template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Periodique&, const int, Type_Double& ) const; template inline void flux_face(const DoubleTab&, const int, const int, const int, const Echange_externe_impose&, const int, Type_Double& ) const { /* Do nothing */ } - template inline void flux_faces_interne(const DoubleTab&, const int, Type_Double& ) 
const; + + template + KOKKOS_INLINE_FUNCTION void flux_faces_bord_comp(CDoubleTabView, CDoubleTabView, const int, const BC_View&, const int, const int, double&) const; + KOKKOS_INLINE_FUNCTION void flux_faces_interne_comp(CDoubleTabView, const int, const int, double&) const; /* ************************************** * * ********* POUR L'IMPLICITE ********** * @@ -77,17 +79,36 @@ class Eval_Conv_VDF_Elem : public Eval_VDF_Elem template inline void coeffs_face_bloc_vitesse_common(const DoubleTab&, const int, Type_Double& ) const; // CRTP pattern to static_cast the appropriate class and get the implementation : This is magic ! - inline int elem_(const int i, const int j) const { return static_cast(this)->get_elem(i,j); } - inline int amont_amont_(const int face, const int i) const { return static_cast(this)->amont_amont(face,i); } - inline double dt_vitesse(const int face, int comp = 0) const { return static_cast(this)->get_dt_vitesse(face, comp); } + template + KOKKOS_INLINE_FUNCTION int elem_(const int i, const int j) const { return static_cast(this)->template get_elem(i,j); } + template + KOKKOS_INLINE_FUNCTION int amont_amont_(const int face, const int i) const { return static_cast(this)->template get_amont_amont(face,i); } + KOKKOS_INLINE_FUNCTION double dist_elem(const int n1, const int n2, const int k) const { return static_cast(this)->get_dist_elem(n1,n2,k); } + template + KOKKOS_INLINE_FUNCTION double dt_vitesse(const int face, int comp = 0) const { return static_cast(this)->template get_dt_vitesse(face, comp); } + template + KOKKOS_INLINE_FUNCTION int dt_vitesse_nb_comp() const { return static_cast(this)->template get_dt_vitesse_nb_comp(); } inline const DoubleTab& tab_vitesse() const { return static_cast(this)->get_tab_vitesse(); } - inline double surface_porosite(const int face) const { return static_cast(this)->get_surface_porosite(face); } + template + KOKKOS_INLINE_FUNCTION double surface_porosite(const int face) const { return 
static_cast(this)->template get_surface_porosite(face); } template inline void quick_fram_(const Type_Double& psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, const DoubleTab& transporte, Type_Double& flux) const { static_cast(this)->template quick_fram(psc, num0, num1, num0_0, num1_1, face, transporte, flux); } + KOKKOS_INLINE_FUNCTION + void quick_fram_view_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, DoubleArrView flux) const + { static_cast(this)->quick_fram_view(psc, num0, num1, num0_0, num1_1, face, transporte, flux); } + KOKKOS_INLINE_FUNCTION + void quick_fram_view_comp_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const + { static_cast(this)->quick_fram_view_comp(psc, num0, num1, num0_0, num1_1, face, transporte, k, flux); } template inline void qcentre_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, const DoubleTab& transporte, Type_Double& flux) const { static_cast(this)->template qcentre(psc,num0,num1,num0_0,num1_1,face,transporte,flux); } + KOKKOS_INLINE_FUNCTION + void qcentre_view_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, DoubleArrView flux) const + { static_cast(this)->qcentre_view(psc,num0,num1,num0_0,num1_1,face,transporte,flux); } + KOKKOS_INLINE_FUNCTION + void qcentre_view_comp_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const + { static_cast(this)->qcentre_view_comp(psc,num0,num1,num0_0,num1_1,face,transporte,k,flux); } }; #include // templates specializations ici ;) diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.tpp 
b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.tpp index 3b2f1e3c45..caf50497c6 100644 --- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.tpp +++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.tpp @@ -21,7 +21,7 @@ inline void Eval_Conv_VDF_Elem::flux_face(const DoubleTab& inco, cons { for (int n = 0; n < flux.size_array(); n++) { - const int ind = (tab_vitesse().line_size() == flux.size_array()) ? n : 0; + const int ind = (dt_vitesse_nb_comp() == flux.size_array()) ? n : 0; const double psc = dt_vitesse(f,ind) * surface_porosite(f); for (int i = 0, e; i < 2; i++) if ((e = elem_(f, i)) > -1) @@ -34,7 +34,7 @@ inline void Eval_Conv_VDF_Elem::flux_face(const DoubleTab& inco, cons { for (int n = 0; n < flux.size_array(); n++) { - const int ind = (tab_vitesse().line_size() == flux.size_array()) ? n : 0; + const int ind = (dt_vitesse_nb_comp() == flux.size_array()) ? n : 0; const double psc = dt_vitesse(f, ind) * surface_porosite(f); for (int i = 0, e; i < 2; i++) if ((e = elem_(f, i)) > -1) @@ -42,91 +42,84 @@ inline void Eval_Conv_VDF_Elem::flux_face(const DoubleTab& inco, cons } } -template template -inline void Eval_Conv_VDF_Elem::flux_face(const DoubleTab& inco, const DoubleTab&, const int face, const Periodique& la_cl, const int num1, Type_Double& flux) const -{ - const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array(); - if (!DERIVED_T::IS_AMONT) +template template +KOKKOS_INLINE_FUNCTION void Eval_Conv_VDF_Elem::flux_faces_bord_comp(CDoubleTabView inco, CDoubleTabView val_b, const int face, const BC_View& bc_view, const int num1, const int k, double& flux) const +{ + if constexpr (std::is_same_v) { - const double psc = dt_vitesse(face) * surface_porosite(face); - const int i_0 = amont_amont_(face, 0), j_1 = amont_amont_(face, 1); - - if (DERIVED_T::IS_CENTRE || DERIVED_T::IS_CENTRE4) - qcentre_ < Type_Double > (psc, i, j, i_0, j_1, face, inco, flux); // on applique le schema centre 2 ou 4 + const int i = elem_(face,0), j = 
elem_(face,1); + if constexpr (!DERIVED_T::IS_AMONT) + { + const double psc = dt_vitesse(face) * surface_porosite(face); + const int i_0 = amont_amont_(face,0), j_1 = amont_amont_(face,1); + if constexpr (DERIVED_T::IS_CENTRE || DERIVED_T::IS_CENTRE4) + qcentre_view_comp_(psc,i,j,i_0,j_1,face,inco,k,flux); + else + quick_fram_view_comp_(psc,i,j,i_0,j_1,face,inco,k,flux); + flux *= -1.; + } else { - Type_Double psc_multi(ncomp); - for (int k = 0; k < ncomp; k++) - { - const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0; - psc_multi[k] = dt_vitesse(face, ind) * surface_porosite(face); - } - quick_fram_(psc_multi, i, j, i_0, j_1, face, inco, flux); // on applique le schema Quick + const int ind = (dt_vitesse_nb_comp() == 1) ? 0 : k; + const double psc = dt_vitesse(face,ind) * surface_porosite(face); + flux = (psc > 0) ? -psc * inco(i,k) : -psc * inco(j,k); } - - for (int k = 0; k < ncomp; k++) flux[k] *= -1; + } + else if constexpr (std::is_same_v || std::is_same_v) + { + const int ind = (dt_vitesse_nb_comp() == 1) ? 0 : k; + const double psc = dt_vitesse(face,ind) * surface_porosite(face); + flux = 0.; + for (int i = 0, e; i < 2; i++) + if ((e = elem_(face,i)) > -1) + flux = -psc * (((psc > 0 && !i) || (psc <= 0 && i)) ? inco(e,k) : val_b(face,k)); } else - for (int k = 0; k < ncomp; k++) - { - const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0; - const double psc = dt_vitesse(face, ind) * surface_porosite(face); - flux[k] = (psc > 0) ? 
-psc * inco(i, k) : -psc * inco(j, k); /* AMONT */ - } + flux = 0.; // generic do nothing } -template template -inline void Eval_Conv_VDF_Elem::flux_faces_interne(const DoubleTab& inco, const int face, Type_Double& flux) const + +template +KOKKOS_INLINE_FUNCTION void Eval_Conv_VDF_Elem::flux_faces_interne_comp(CDoubleTabView inco, const int face, const int k, double& flux) const { - const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array(); - if (!DERIVED_T::IS_AMONT) + const int i = elem_(face,0), j = elem_(face,1); + if constexpr (!DERIVED_T::IS_AMONT) { - const double psc = dt_vitesse(face)*surface_porosite(face); - const int i_0 = amont_amont_(face,0), j_1 = amont_amont_(face,1); - if (DERIVED_T::IS_CENTRE) + const double psc = dt_vitesse(face)*surface_porosite(face); + const int i_0 = amont_amont_(face,0), j_1 = amont_amont_(face,1); + if constexpr (DERIVED_T::IS_CENTRE) { - qcentre_(psc,i,j,i_0,j_1,face,inco,flux); - for (int k=0; k(psc,i,j,i_0,j_1,face,inco,flux); - for (int k=0; k= 0) || (j_1 == -1 && psc <= 0)) + flux = (psc > 0) ? -psc*inco(i,k) : -psc*inco(j,k); + else { - const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0; - psc_multi[k] = dt_vitesse(face, ind) * surface_porosite(face); + quick_fram_view_comp_(psc,i,j,i_0,j_1,face,inco,k,flux); + flux *= -1.; } - quick_fram_(psc_multi,i,j,i_0,j_1,face,inco,flux); - for (int k=0; k 0) ? -psc * inco(i, k) : -psc * inco(j, k); /* AMONT */ - } + { + const int ind = (dt_vitesse_nb_comp() == 1) ? 0 : k; + const double psc = dt_vitesse(face, ind)*surface_porosite(face); + flux = (psc > 0) ? -psc * inco(i, k) : -psc * inco(j, k); + } } /* ************************************** * @@ -149,7 +142,7 @@ inline void Eval_Conv_VDF_Elem::coeffs_face_common(const int face, Ty else for (int k = 0; k < ncomp; k++) { - const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0; + const int ind = (dt_vitesse_nb_comp() == ncomp) ? 
k : 0; psc = dt_vitesse(face, ind)*surface_porosite(face); aii[k] = (psc > 0) ? psc : 0.; ajj[k] = 0.; @@ -165,7 +158,7 @@ inline void Eval_Conv_VDF_Elem::coeffs_face_common(const int face, Ty else for (int k = 0; k < ncomp; k++) { - const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0; + const int ind = (dt_vitesse_nb_comp() == ncomp) ? k : 0; psc = dt_vitesse(face, ind)*surface_porosite(face); ajj[k] = (psc < 0) ? -psc : 0.; aii[k] = 0.; @@ -203,7 +196,7 @@ inline void Eval_Conv_VDF_Elem::coeffs_face(const int face, const int else for (int k = 0; k < ncomp; k++) { - const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0; + const int ind = (dt_vitesse_nb_comp() == ncomp) ? k : 0; const double psc = dt_vitesse(face, ind)*surface_porosite(face); aii[k] = (psc > 0) ? psc : 0.; ajj[k] = (psc > 0) ? 0. : -psc; @@ -237,7 +230,7 @@ inline void Eval_Conv_VDF_Elem::coeffs_faces_interne(const int face, else for (int k = 0; k < ncomp; k++) { - const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0; + const int ind = (dt_vitesse_nb_comp() == ncomp) ? k : 0; const double psc = dt_vitesse(face, ind)*surface_porosite(face); aii[k] = (psc > 0) ? psc : 0.; ajj[k] = (psc > 0) ? 0. : -psc; @@ -254,7 +247,7 @@ inline void Eval_Conv_VDF_Elem::coeffs_face_bloc_vitesse(const Double for (int i = 0, e; i < 2; i++) if ((e = elem_(f, i)) > -1) { - const int ind = (tab_vitesse().line_size() == flux.size_array()) ? n : 0; + const int ind = (dt_vitesse_nb_comp() == flux.size_array()) ? n : 0; flux[n] = surface_porosite(f) * (((dt_vitesse(f, ind) > 0 && !i) || (dt_vitesse(f, ind) <= 0 && i)) ? inco(e, n) : val_b(f, n)); @@ -272,7 +265,7 @@ inline void Eval_Conv_VDF_Elem::coeffs_face_bloc_vitesse(const Double for (int i = 0, e; i < 2; i++) if ((e = elem_(f, i)) > -1) { - const int ind = (tab_vitesse().line_size() == flux.size_array()) ? n : 0; + const int ind = (dt_vitesse_nb_comp() == flux.size_array()) ? 
n : 0; flux[n] = surface_porosite(f) * (((dt_vitesse(f, ind) > 0 && !i) || (dt_vitesse(f, ind) <= 0 && i)) ? inco(e, n) : val_b(f, n)); @@ -296,7 +289,7 @@ inline void Eval_Conv_VDF_Elem::coeffs_face_bloc_vitesse_common(const else for (int k = 0; k < ncomp; k++) { - const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0; + const int ind = (dt_vitesse_nb_comp() == ncomp) ? k : 0; flux[k] = (dt_vitesse(face, ind) > 0) ? psc * inco(i, k) : psc * inco(j, k); } } diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem_leaves.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem_leaves.h index 6ed54b39c4..fbf309ce88 100644 --- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem_leaves.h +++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem_leaves.h @@ -55,14 +55,21 @@ class Eval_Centre_VDF_Elem : public Eval_Conv_VDF_Elem, pu { public: static constexpr bool IS_CENTRE = true; - inline int amont_amont(int face, int i) const override { return le_dom->amont_amont(face, i); } - inline double dim_elem(int n1, int k) const override { return le_dom->dim_elem(n1,k); } - inline double dist_elem(int n1, int n2, int k) const override { return le_dom->dist_elem(n1,n2,k); } - inline double dist_face_elem1(int num_face,int n1) const { return le_dom->dist_face_elem1(num_face, n1); } - template inline void qcentre(const double psc, const int num0, const int num1, const int num0_0, const int num1_1,const int face,const DoubleTab& transporte, Type_Double& flux) const - { qcentre2_impl(psc,num0,num1,num0_0,num1_1,face,transporte,flux); } + { + qcentre2_impl(psc,num0,num1,num0_0,num1_1,face,transporte,flux); + } + KOKKOS_INLINE_FUNCTION + void qcentre_view(const double psc, const int num0, const int num1, const int num0_0, const int num1_1,const int face,CDoubleTabView transporte, DoubleArrView flux) const + { + qcentre2_impl_view(psc,num0,num1,num0_0,num1_1,face,transporte,flux); + } + KOKKOS_INLINE_FUNCTION + void qcentre_view_comp(const double psc, const int num0, const int num1, 
const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const + { + qcentre2_impl_comp(psc,num0,num1,transporte,k,flux); + } }; /*! @brief class Eval_Centre4_VDF_Elem Evaluateur VDF pour la convection Le champ convecte est scalaire (Champ_P0_VDF) @@ -75,18 +82,29 @@ class Eval_Centre4_VDF_Elem : public Eval_Conv_VDF_Elem, public: static constexpr bool IS_CENTRE4 = true; - inline int amont_amont(int face, int i) const override { return le_dom->amont_amont(face, i); } - inline double dist_elem(int n1, int n2, int k) const override { return le_dom->dist_elem_period(n1,n2,k); } - template inline void qcentre(const double ,const int ,const int ,const int ,const int ,const int , const DoubleTab& ,Type_Double& ) const; + template + inline void qcentre(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, const DoubleTab& transporte, Type_Double& flux) const + { + const int ori = orientation(face); + const double dx = get_dist_elem_period(num0, num1, ori), dxam = get_dist_elem_period(num0_0, num0, ori), dxav = get_dist_elem_period(num1, num1_1, ori); + qcentre4_impl(ori,dx,dxam,dxav,psc,num0,num1,num0_0,num1_1,face,transporte,flux); + } + KOKKOS_INLINE_FUNCTION + void qcentre_view(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, DoubleArrView flux) const + { + const int ori = orientation_v_(face); + const double dx = get_dist_elem_period(num0, num1, ori), dxam = get_dist_elem_period(num0_0, num0, ori), dxav = get_dist_elem_period(num1, num1_1, ori); + qcentre4_impl_view(ori,dx,dxam,dxav,psc,num0,num1,num0_0,num1_1,face,transporte,flux); + } + KOKKOS_INLINE_FUNCTION + void qcentre_view_comp(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const + { + const int ori = orientation_v_(face); + 
const double dx = get_dist_elem_period(num0, num1, ori), dxam = get_dist_elem_period(num0_0, num0, ori), dxav = get_dist_elem_period(num1, num1_1, ori); + qcentre4_impl_comp(ori,dx,dxam,dxav,psc,num0,num1,num0_0,num1_1,transporte,k,flux); + } }; -template -inline void Eval_Centre4_VDF_Elem::qcentre(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, const DoubleTab& transporte, Type_Double& flux) const -{ - const int ori = orientation(face); - const double dx = dist_elem(num0, num1, ori), dxam = dist_elem(num0_0, num0, ori), dxav = dist_elem(num1, num1_1, ori); - qcentre4_impl(ori,dx,dxam,dxav,psc,num0,num1,num0_0,num1_1,face,transporte,flux); -} /*! @brief class Eval_Quick_VDF_Elem Evaluateur VDF pour la convection Le champ convecte est scalaire (Champ_P0_VDF) * @@ -97,20 +115,33 @@ class Eval_Quick_VDF_Elem : public Eval_Conv_VDF_Elem, publ { public: static constexpr bool IS_QUICK = true; - inline int amont_amont(int face, int i) const override { return le_dom->amont_amont(face, i); } - inline double dim_elem(int n1, int k) const override { return le_dom->dim_elem(n1,k); } - inline double dist_elem(int n1, int n2, int k) const override { return le_dom->dist_elem_period(n1,n2,k); } - template inline void quick_fram(const Type_Double&, const int, const int,const int, const int ,const int ,const DoubleTab&, Type_Double& ) const; + template + inline void quick_fram(const double psc, const int num0, const int num1,const int num0_0, const int num1_1, const int face,const DoubleTab& transporte, Type_Double& flux) const + { + const int ori = orientation(face); + const double dx = get_dist_elem_period(num0, num1, ori), + dm0 = get_dim_elem(num0, ori), dxam0 = (num0_0!=-1?get_dist_elem_period(num0_0, num0, ori):0), + dm1 = get_dim_elem(num1, ori), dxam1 = (num1_1!=-1?get_dist_elem_period(num1, num1_1, ori):0); + quick_fram_impl(ori,dx,dm0,dxam0,dm1,dxam1,psc,num0,num1,num0_0,num1_1,face,transporte,flux); + } + 
KOKKOS_INLINE_FUNCTION + void quick_fram_view(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, DoubleArrView flux) const + { + const int ori = orientation_v_(face); + const double dx = get_dist_elem_period(num0, num1, ori), + dm0 = get_dim_elem(num0, ori), dxam0 = (num0_0!=-1?get_dist_elem_period(num0_0, num0, ori):0), + dm1 = get_dim_elem(num1, ori), dxam1 = (num1_1!=-1?get_dist_elem_period(num1, num1_1, ori):0); + quick_fram_impl_view(ori,dx,dm0,dxam0,dm1,dxam1,psc,num0,num1,num0_0,num1_1,face,transporte,flux); + } + KOKKOS_INLINE_FUNCTION + void quick_fram_view_comp(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const + { + const int ori = orientation_v_(face); + const double dx = get_dist_elem_period(num0, num1, ori), + dm0 = get_dim_elem(num0, ori), dxam0 = (num0_0!=-1?get_dist_elem_period(num0_0, num0, ori):0), + dm1 = get_dim_elem(num1, ori), dxam1 = (num1_1!=-1?get_dist_elem_period(num1, num1_1, ori):0); + quick_fram_impl_comp(ori,dx,dm0,dxam0,dm1,dxam1,psc,num0,num1,num0_0,num1_1,transporte,k,flux); + } }; -template -inline void Eval_Quick_VDF_Elem::quick_fram(const Type_Double& psc, const int num0, const int num1,const int num0_0, const int num1_1, const int face,const DoubleTab& transporte, Type_Double& flux) const -{ - const int ori = orientation(face); - const double dx = dist_elem(num0, num1, ori), - dm0 = dim_elem(num0, ori), dxam0 = (num0_0!=-1?dist_elem(num0_0, num0, ori):0), - dm1 = dim_elem(num1, ori), dxam1 = (num1_1!=-1?dist_elem(num1, num1_1, ori):0); - quick_fram_impl(ori,dx,dm0,dxam0,dm1,dxam1,psc,num0,num1,num0_0,num1_1,face,transporte,flux); -} - #endif /* Eval_Conv_VDF_Elem_leaves_included */ diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.h index 192ae8f759..1b1520a87b 100644 --- 
a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.h +++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -34,40 +34,53 @@ class Eval_Conv_VDF_Face : public Eval_VDF_Face * ********* POUR L'EXPLICITE ********** * * ************************************** */ - template inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void> - flux_fa7(const DoubleTab&, const DoubleTab*, int , const Neumann_sortie_libre&, int, Type_Double& ) const; + template + inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void> + flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ; - template inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void> - flux_fa7(const DoubleTab&, const DoubleTab*,int, int, int, Type_Double& ) const; + // _comp variants: void with output ref(s) for one component k + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const; - template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void> - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const; - template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void> - flux_arete(const DoubleTab&, const DoubleTab*,int, int, int, int, Type_Double& ) const ; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, double& flux) const { flux = 
0.; } - template - inline std::enable_if_t<(Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI), void> - flux_arete(const DoubleTab&, const DoubleTab*,int, int, int, int, Type_Double& ) const { /* do nothing */ } + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, int, double&, double&) const; - template - inline std::enable_if_t - flux_arete(const DoubleTab&, const DoubleTab*,int, int, int, int, Type_Double&, Type_Double& ) const; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&, double&) const; - template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PERIODICITE, void> - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double& ) const ; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&, double&) const; - template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void> - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double&) const; + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t + flux_fa7_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, double&) const; + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_fa7_comp(CDoubleTabView, CDoubleTabView, int, CDoubleTabView, int, int, double&) const; /* ************************************** * - * ********* POUR L'IMPLICITE ********** * - * ************************************** */ + * ********* POUR L'IMPLICITE ********** * + * ************************************** */ - template inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void> + template + inline std::enable_if_t< Fa7_Type == 
Type_Flux_Fa7::SORTIE_LIBRE, void> coeffs_fa7(const DoubleTab*, int , const Neumann_sortie_libre&, Type_Double& , Type_Double& ) const; - template inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void> + template + inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void> coeffs_fa7(const DoubleTab*, int, int, int, Type_Double& , Type_Double& ) const; template @@ -76,43 +89,56 @@ class Eval_Conv_VDF_Face : public Eval_VDF_Face template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE || Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void> - coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const; + coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const; + + template + inline std::enable_if_t<(Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI), void> + coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const { /* do nothing */ } - template inline - std::enable_if_t<(Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI), void> - coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const { /* do nothing */ } private: template inline void fill_coeffs_proto(const int, const double , const double, Type_Double& , Type_Double& ) const; // CRTP pattern to static_cast the appropriate class and get the implementation: This is magic ! 
- inline int premiere_face_bord() const { return static_cast(this)->get_premiere_face_bord(); } - inline int orientation(int face) const { return static_cast(this)->get_orientation(face); } - inline int elem_(int i, int j) const { return static_cast(this)->get_elem(i,j); } - inline int face_amont_princ_(int num_face, int i) const { return static_cast(this)->face_amont_princ(num_face,i); } - inline int face_amont_conj_(int num_face,int i, int k) const { return static_cast(this)->face_amont_conj(num_face,i,k); } - inline double dt_vitesse(int face, int comp = 0) const { return static_cast(this)->get_dt_vitesse(face, comp); } - inline double surface_porosite(int face) const { return static_cast(this)->get_surface_porosite(face); } - inline double surface(int face) const { return static_cast(this)->get_surface(face); } - inline double porosite(int face) const { return static_cast(this)->get_porosite(face); } - inline double dim_face_(int n1,int k) const { return static_cast(this)->dim_face(n1,k); } - inline double dim_elem_(int n1,int k) const { return static_cast(this)->dim_elem(n1,k); } - inline double dist_face_(int n1,int n2,int k) const { return static_cast(this)->dist_face(n1,n2,k); } - inline double dist_face_period_(int n1,int n2,int k) const { return static_cast(this)->dist_face_period(n1,n2,k); } - inline double dist_elem_period_(int n1, int n2, int k) const { return static_cast(this)->dist_elem_period(n1,n2,k); } + template + KOKKOS_INLINE_FUNCTION int orientation(int face) const { return static_cast(this)->template get_orientation(face); } + template + KOKKOS_INLINE_FUNCTION int elem_(int i, int j) const { return static_cast(this)->template get_elem(i,j); } + template + KOKKOS_INLINE_FUNCTION int face_amont_princ_(int num_face, int i) const { return static_cast(this)->template get_face_amont_princ(num_face,i); } + template + KOKKOS_FORCEINLINE_FUNCTION int face_amont_conj_(int num_face,int i, int k) const { return static_cast(this)->template 
get_face_amont_conj(num_face,i,k); } + template + KOKKOS_INLINE_FUNCTION double dt_vitesse(int face, int comp = 0) const { return static_cast(this)->template get_dt_vitesse(face, comp); } + //KOKKOS_INLINE_FUNCTION double surface_porosite(int face) const { return static_cast(this)->get_surface_porosite(face); } + template + KOKKOS_INLINE_FUNCTION double surface(int face) const { return static_cast(this)->template get_surface(face); } + template + KOKKOS_INLINE_FUNCTION double porosite(int face) const { return static_cast(this)->template get_porosite(face); } + template + KOKKOS_INLINE_FUNCTION double dim_face_(int n1,int k) const { return static_cast(this)->template get_dim_face(n1,k); } + template + KOKKOS_INLINE_FUNCTION double dim_elem_(int n1,int k) const { return static_cast(this)->template get_dim_elem(n1,k); } + template + KOKKOS_INLINE_FUNCTION double dist_face_(int n1,int n2,int k) const { return static_cast(this)->template get_dist_face(n1,n2,k); } + template + KOKKOS_INLINE_FUNCTION double dist_face_period_(int n1,int n2,int k) const { return static_cast(this)->template get_dist_face_period(n1,n2,k); } + template + KOKKOS_INLINE_FUNCTION double dist_elem_period_(int n1, int n2, int k) const { return static_cast(this)->template get_dist_elem_period(n1,n2,k); } + inline const Domaine_Cl_VDF& la_zcl() const { return static_cast(this)->get_la_zcl(); } - inline double conv_quick_sharp_plus_(const double psc,const double vit_0, const double vit_1, const double vit_0_0, const double dx, const double dm, const double dxam) const + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus_(const double psc,const double vit_0, const double vit_1, const double vit_0_0, const double dx, const double dm, const double dxam) const { return static_cast(this)->conv_quick_sharp_plus(psc,vit_0,vit_1,vit_0_0,dx,dm,dxam); } - inline double conv_quick_sharp_moins_(const double psc,const double vit_0,const double vit_1, const double vit_1_1,const double dx, const double dm,const 
double dxam) const + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins_(const double psc,const double vit_0,const double vit_1, const double vit_1_1,const double dx, const double dm,const double dxam) const { return static_cast(this)->conv_quick_sharp_moins(psc,vit_0,vit_1,vit_1_1,dx,dm,dxam); } - inline double conv_centre_(const double psc,const double vit_0_0, const double vit_0, const double vit_1, const double vit1_1,double g1, double g2, double g3,double g4) const + KOKKOS_INLINE_FUNCTION double conv_centre_(const double psc,const double vit_0_0, const double vit_0, const double vit_1, const double vit1_1,double g1, double g2, double g3,double g4) const { return static_cast(this)->conv_centre(psc,vit_0_0,vit_0,vit_1,vit1_1,g1,g2,g3,g4); } - inline void calcul_g_(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const + KOKKOS_INLINE_FUNCTION void calcul_g_(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const { static_cast(this)->calcul_g(dxam,dx,dxav,g1,g2,g3,g4); } }; diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.tpp b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.tpp index b398471925..57f0ec1eef 100644 --- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.tpp +++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.tpp @@ -20,102 +20,67 @@ * ********* POUR L'EXPLICITE ********** * * ************************************** */ -template template inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void> -Eval_Conv_VDF_Face::flux_fa7(const DoubleTab& inco, const DoubleTab* a_r, int face, const Neumann_sortie_libre& la_cl, int num1, Type_Double& flux) const -{ - const int elem1 = elem_(face, 0), elem2 = elem_(face,1); - for (int k = 0; k < flux.size_array(); k++) - { - double psc = dt_vitesse(face, k) * surface(face); - if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)((elem1 != -1) ? 
elem1 : elem2, k); - flux[k] = -psc * inco(face, k) * porosite(face); - } -} - -template template inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void> -Eval_Conv_VDF_Face::flux_fa7(const DoubleTab& inco, const DoubleTab* a_r, int num_elem, int fac1, int fac2, Type_Double& flux) const +template template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void> +Eval_Conv_VDF_Face::flux_fa7_comp(CDoubleTabView inco, CDoubleTabView a_r, int num_elem, int fac1, int fac2, int k, double& flux) const { - const int ncomp = flux.size_array(); - double psc = 0.25*(dt_vitesse(fac1)+dt_vitesse(fac2))*(surface(fac1)+surface(fac2)); + double psc = 0.25 * (dt_vitesse(fac1) + dt_vitesse(fac2)) * + (surface(fac1) + surface(fac2)); if (DERIVED_T::IS_AMONT) { - for (int k = 0; k < ncomp; k++) + psc = 0.25 * (dt_vitesse(fac1, k) + dt_vitesse(fac2, k)) * + (surface(fac1) + surface(fac2)); + const int f = psc > 0 ? fac1 : fac2; + if (a_r.size() > 0) { - psc = 0.25*(dt_vitesse(fac1,k)+dt_vitesse(fac2,k))*(surface(fac1)+surface(fac2)); - const int f = psc > 0 ? fac1 : fac2; - - if (a_r) - { - const int elem = elem_(f, 0), elem2 = elem_(f, 1); - const int e = dt_vitesse(f,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); - psc *= (*a_r)(e, k); - } - - flux[k] = -psc * inco(f, k) * porosite(f); + const int elem = elem_(f, 0), elem2 = elem_(f, 1); + const int e = dt_vitesse(f, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? 
elem2 : elem); + psc *= a_r(e, k); } + flux = -psc * inco(f, k) * porosite(f); } else if (DERIVED_T::IS_CENTRE) - for (int k = 0; k < ncomp; k++) flux[k] = -psc*0.5*(inco(fac1,k)*porosite(fac1)+inco(fac2,k)*porosite(fac2)); + flux = -psc * 0.5 * (inco(fac1, k) * porosite(fac1) + inco(fac2, k) * porosite(fac2)); else { - const int num0_0 = face_amont_princ_(fac1,0), num1_1 = face_amont_princ_(fac2,1); + const int num0_0 = face_amont_princ_(fac1, 0), num1_1 = face_amont_princ_(fac2, 1); if (DERIVED_T::IS_CENTRE4) { - const int ori = orientation(fac1); - if ( (num0_0 == -1) || (num1_1== -1) ) - for (int k = 0; k < ncomp; k++) flux[k] = -psc*0.5*(inco(fac1,k)*porosite(fac1)+inco(fac2,k)*porosite(fac2)); // Schema centre 2 - else // Schema centre 4 + const int ori = orientation(fac1); + if ((num0_0 == -1) || (num1_1 == -1)) + flux = -psc * 0.5 * (inco(fac1, k) * porosite(fac1) + inco(fac2, k) * porosite(fac2)); + else { - Type_Double vit_0(ncomp),vit_0_0(ncomp),vit_1_1(ncomp),vit_1(ncomp); - const double dx = dim_elem_(num_elem,ori), dxam = dim_elem_(elem_(fac1,0),ori), dxav = dim_elem_(elem_(fac2,1),ori); + const double dx = dim_elem_(num_elem, ori); + const double dxam = dim_elem_(elem_(fac1, 0), ori); + const double dxav = dim_elem_(elem_(fac2, 1), ori); double g1, g2, g3, g4; - calcul_g_(dxam,dx,dxav,g1,g2,g3,g4); - for (int k = 0; k < ncomp; k++) - { - vit_0_0[k] = inco(num0_0,k)*porosite(num0_0); - vit_0[k] = inco(fac1,k)*porosite(fac1); - vit_1[k] = inco(fac2,k)*porosite(fac2); - vit_1_1[k] = inco(num1_1,k)*porosite(num1_1); - flux[k] = -conv_centre_(psc,vit_0_0[k],vit_0[k],vit_1[k],vit_1_1[k],g1,g2,g3,g4); - } + calcul_g_(dxam, dx, dxav, g1, g2, g3, g4); + flux = -conv_centre_(psc, inco(num0_0, k) * porosite(num0_0), inco(fac1, k) * porosite(fac1), + inco(fac2, k) * porosite(fac2), inco(num1_1, k) * porosite(num1_1), g1, g2, g3, g4); } } - else // QUICK + else // QUICK { - if (psc > 0) + if (psc > 0) { if (num0_0 == -1) - for (int k=0; k(fac1); else { - 
Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1(ncomp); - const int ori = orientation(fac1), elem_amont = elem_(fac1,0); - const double dx = dim_elem_(num_elem,ori), dm = dist_elem_period_(elem_amont,num_elem,ori), dxam = dim_elem_(elem_amont,ori); - for (int k = 0; k < ncomp; k++) - { - vit_0[k] = inco(fac1,k)*porosite(fac1); - vit_1[k] = inco(fac2,k)*porosite(fac2); - vit_0_0[k] = inco(num0_0,k)*porosite(num0_0); - flux[k] = -conv_quick_sharp_plus_(psc,vit_0[k],vit_1[k],vit_0_0[k],dx,dm,dxam); - } + const int ori = orientation(fac1), elem_amont = elem_(fac1, 0); + const double dx = dim_elem_(num_elem, ori), dm = dist_elem_period_(elem_amont, num_elem, ori), dxam = dim_elem_(elem_amont, ori); + flux = -conv_quick_sharp_plus_(psc, inco(fac1, k) * porosite(fac1), inco(fac2, k) * porosite(fac2), inco(num0_0, k) * porosite(num0_0), dx, dm, dxam); } } - else // (psc < 0) + else { if (num1_1 == -1) - for (int k = 0; k < ncomp; k++) flux[k] = -psc*inco(fac2,k)*porosite(fac2); // Schema amont + flux = -psc * inco(fac2, k) * porosite(fac2); else { - Type_Double vit_0(ncomp), vit_1(ncomp), vit_1_1(ncomp); - const int ori = orientation(fac2), elem_amont = elem_(fac2,1); - const double dx = dim_elem_(num_elem,ori), dm = dist_elem_period_(num_elem,elem_amont,ori), dxam = dim_elem_(elem_amont,ori); - for (int k = 0; k < ncomp; k++) - { - vit_0[k] = inco(fac1,k)*porosite(fac1); - vit_1[k] = inco(fac2,k)*porosite(fac2); - vit_1_1[k] = inco(num1_1,k)*porosite(num1_1); - flux[k] = -conv_quick_sharp_moins_(psc,vit_0[k],vit_1[k],vit_1_1[k],dx,dm,dxam); - } + const int ori = orientation(fac2), elem_amont = elem_(fac2, 1); + const double dx = dim_elem_(num_elem, ori), dm = dist_elem_period_(num_elem, elem_amont, ori), dxam = dim_elem_(elem_amont, ori); + flux = -conv_quick_sharp_moins_(psc, inco(fac1, k) * porosite(fac1), inco(fac2, k) * porosite(fac2), inco(num1_1, k) * porosite(num1_1), dx, dm, dxam); } } } @@ -155,18 +120,17 @@ Eval_Conv_VDF_Face::flux_arete(const DoubleTab& inco, 
const DoubleTab for (int k = 0; k < ncomp; k++) flux[k] = -0.5*(inco(fac3,k)+inco(fac4,k))*psc ; // Schema centre 2 (pas assez de faces) else // Schema Centre 4 { - Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1_1(ncomp), vit_1(ncomp); // Inutile de prendre dist_face_period pour dx car fac3 et fac4 ne peuvent etre periodiques (arete interne) const double dx = dist_face_(fac3,fac4,ori), dxam = dist_face_period_(num0_0,fac3,ori), dxav = dist_face_period_(fac4,num1_1,ori); double g1, g2, g3, g4; calcul_g_(dxam,dx,dxav,g1,g2,g3,g4); for (int k = 0; k < ncomp; k++) { - vit_0_0[k] = inco(num0_0,k); - vit_0[k] = inco(fac3,k); - vit_1[k] = inco(fac4,k); - vit_1_1[k] = inco(num1_1,k); - flux[k] = -conv_centre_(psc,vit_0_0[k],vit_0[k],vit_1[k],vit_1_1[k],g1,g2,g3,g4); + const double vit_0_0 = inco(num0_0,k); + const double vit_0 = inco(fac3,k); + const double vit_1 = inco(fac4,k); + const double vit_1_1 = inco(num1_1,k); + flux[k] = -conv_centre_(psc,vit_0_0,vit_0,vit_1,vit_1_1,g1,g2,g3,g4); } } } @@ -178,14 +142,13 @@ Eval_Conv_VDF_Face::flux_arete(const DoubleTab& inco, const DoubleTab for (int k = 0; k < ncomp; k++) flux[k] = -psc*inco(fac3,k); // Schema amont else // Schema quick { - Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1(ncomp); const double dx = dist_face_period_(fac3,fac4,ori), dm = dim_face_(fac3,ori), dxam = dist_face_period_(num0_0,fac3,ori); for (int k = 0; k < ncomp; k++) { - vit_0[k] = inco(fac3,k); - vit_1[k] = inco(fac4,k); - vit_0_0[k] = inco(num0_0,k); - flux[k] = -conv_quick_sharp_plus_(psc,vit_0[k],vit_1[k],vit_0_0[k],dx,dm,dxam); + const double vit_0 = inco(fac3,k); + const double vit_1 = inco(fac4,k); + const double vit_0_0 = inco(num0_0,k); + flux[k] = -conv_quick_sharp_plus_(psc,vit_0,vit_1,vit_0_0,dx,dm,dxam); } } } @@ -195,14 +158,13 @@ Eval_Conv_VDF_Face::flux_arete(const DoubleTab& inco, const DoubleTab for (int k = 0; k < ncomp; k++) flux[k] = -psc*inco(fac4,k); // Schema amont else // Schema quick { - Type_Double vit_0(ncomp), 
vit_1(ncomp), vit_1_1(ncomp); const double dx = dist_face_period_(fac3,fac4,ori), dm = dim_face_(fac4,ori), dxam = dist_face_period_(fac4,num1_1,ori); for (int k = 0; k < ncomp; k++) { - vit_0[k] = inco(fac3,k); - vit_1[k] = inco(fac4,k); - vit_1_1[k] = inco(num1_1,k); - flux[k] = -conv_quick_sharp_moins_(psc,vit_0[k],vit_1[k],vit_1_1[k],dx,dm,dxam); + const double vit_0 = inco(fac3,k); + const double vit_1 = inco(fac4,k); + const double vit_1_1 = inco(num1_1,k); + flux[k] = -conv_quick_sharp_moins_(psc,vit_0,vit_1,vit_1_1,dx,dm,dxam); } } } @@ -210,225 +172,187 @@ Eval_Conv_VDF_Face::flux_arete(const DoubleTab& inco, const DoubleTab } } -template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void> -Eval_Conv_VDF_Face::flux_arete(const DoubleTab& inco, const DoubleTab* a_r, int fac1, int fac2,int fac3, int fac4, Type_Double& flux) const +// ===== _comp scalar variants (one component k) for MDRangePolicy kernels ===== + +template template +KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t +Eval_Conv_VDF_Face::flux_arete_comp(CDoubleTabView inco, CDoubleTabView a_r, int fac1, int fac2, int fac3, int fac4, int k, double& flux) const { - double psc = 0.25*((dt_vitesse(fac1)*porosite(fac1)+dt_vitesse(fac2)*porosite(fac2))*(surface(fac1)+surface(fac2))); - if (DERIVED_T::IS_CENTRE) - for (int k = 0; k < flux.size_array(); k++) flux[k] = -psc*0.5*(inco(fac3,k)+inco(fac4,k)); + double psc = 0.25*((dt_vitesse(fac1)*porosite(fac1)+dt_vitesse(fac2)*porosite(fac2))*(surface(fac1)+surface(fac2))); + if (DERIVED_T::IS_AMONT) + { + psc = 0.25*((dt_vitesse(fac1,k)*porosite(fac1)+dt_vitesse(fac2,k)*porosite(fac2))*(surface(fac1)+surface(fac2))); + const int f = psc > 0 ? fac3 : fac4; + if (a_r.size()>0) + { + const int elem = elem_(f,0), elem2 = elem_(f,1); + const int e = dt_vitesse(f,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? 
elem2 : elem); + psc *= a_r(e,k); + } + flux = -psc*inco(f,k); + } + else if (DERIVED_T::IS_CENTRE) + flux = -0.5*(inco(fac3,k)+inco(fac4,k))*psc; else { - for (int k = 0; k < flux.size_array(); k++) + const int ori = orientation(fac1); + const int num0_0 = face_amont_conj_(fac3,ori,0), num1_1 = face_amont_conj_(fac4,ori,1); + if (DERIVED_T::IS_CENTRE4) + { + if ((num0_0==-1)||(num1_1==-1)) { flux = -0.5*(inco(fac3,k)+inco(fac4,k))*psc; return; } + const double dx = dist_face_(fac3,fac4,ori), dxam = dist_face_period_(num0_0,fac3,ori), dxav = dist_face_period_(fac4,num1_1,ori); + double g1,g2,g3,g4; + calcul_g_(dxam,dx,dxav,g1,g2,g3,g4); + flux = -conv_centre_(psc,inco(num0_0,k),inco(fac3,k),inco(fac4,k),inco(num1_1,k),g1,g2,g3,g4); + } + else // IS_QUICK { - psc = 0.25 * ((dt_vitesse(fac1, k) * porosite(fac1) + dt_vitesse(fac2, k) * porosite(fac2)) * (surface(fac1) + surface(fac2))); if (psc > 0) { - const int elem = elem_(fac3, 0) > 0 ? elem_(fac3, 0) : elem_(fac3, 1); - if (a_r) psc *= (*a_r)(elem,k); - flux[k] = -psc * inco(fac3, k); + if (num0_0==-1) { flux = -psc*inco(fac3,k); return; } + const double dx = dist_face_period_(fac3,fac4,ori), dm = dim_face_(fac3,ori), dxam = dist_face_period_(num0_0,fac3,ori); + flux = -conv_quick_sharp_plus_(psc,inco(fac3,k),inco(fac4,k),inco(num0_0,k),dx,dm,dxam); } else { - const int elem = elem_(fac4, 0) > 0 ? 
elem_(fac4, 0) : elem_(fac4, 1); - if (a_r) psc *= (*a_r)(elem,k); - flux[k] = -psc * inco(fac4, k); + if (num1_1==-1) { flux = -psc*inco(fac4,k); return; } + const double dx = dist_face_period_(fac3,fac4,ori), dm = dim_face_(fac4,ori), dxam = dist_face_period_(fac4,num1_1,ori); + flux = -conv_quick_sharp_moins_(psc,inco(fac3,k),inco(fac4,k),inco(num1_1,k),dx,dm,dxam); } } } } -template template -inline std::enable_if_t -Eval_Conv_VDF_Face::flux_arete(const DoubleTab& inco, const DoubleTab* a_r, int fac1, int fac2, int fac3, int signe, Type_Double& flux3, Type_Double& flux1_2) const +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Conv_VDF_Face::flux_arete_comp(CDoubleTabView inco, CDoubleTabView a_r, int fac1, int fac2, int fac3, int fac4, int k, double& flux) const { - assert(flux3.size_array() == flux1_2.size_array()); - constexpr bool is_SYM = (Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE); - if (DERIVED_T::IS_AXI && is_SYM) return; - const int ncomp = flux3.size_array(); - - const int pfb = premiere_face_bord(), ori = orientation(fac3), rang1 = DERIVED_T::IS_QUICK ? fac1 : (fac1-pfb), rang2 = DERIVED_T::IS_QUICK ? fac2 :(fac2-pfb); // TODO : FIXME : euh ? pourquoi ca ? - - for (int k = 0; k < ncomp; k++) + double psc = 0.25*((dt_vitesse(fac1)*porosite(fac1)+dt_vitesse(fac2)*porosite(fac2))*(surface(fac1)+surface(fac2))); + if (DERIVED_T::IS_CENTRE) { flux = -psc*0.5*(inco(fac3,k)+inco(fac4,k)); return; } + psc = 0.25*((dt_vitesse(fac1,k)*porosite(fac1)+dt_vitesse(fac2,k)*porosite(fac2))*(surface(fac1)+surface(fac2))); + if (psc > 0) { - double psc = 0.25*((dt_vitesse(fac1,k)*porosite(fac1)+dt_vitesse(fac2,k)*porosite(fac2))*(surface(fac1)+surface(fac2))); - if ((psc*signe)>0) - { - const int elem = elem_(fac3, 0), elem2 = elem_(fac3, 1); - const int e = dt_vitesse(fac3, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); - const double aa_r = (a_r && DERIVED_T::IS_AMONT) ? 
(*a_r)(e, k) : 1.0; - flux3[k] = -aa_r*inco(fac3,k)*psc ; - } - else - { - const int ind = ncomp*ori+k; - const double vf1 = Champ_Face_get_val_imp_face_bord_sym(inco,inconnue->temps(),rang1,ind,la_zcl()); - const double vf2 = Champ_Face_get_val_imp_face_bord_sym(inco,inconnue->temps(),rang2,ind,la_zcl()); - const int elem = elem_(fac3, 0), elem2 = elem_(fac3, 1); - const int e = dt_vitesse(fac3, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); - const double aa_r = (a_r && DERIVED_T::IS_AMONT) ? (*a_r)(e, k) : 1.0; - flux3[k] = -aa_r * 0.5 * (vf1 + vf2) * psc ; - } + const int elem = elem_(fac3,0) > 0 ? elem_(fac3,0) : elem_(fac3,1); + if (a_r.size()>0) psc *= a_r(elem,k); + flux = -psc*inco(fac3,k); } - - for (int k = 0; k < ncomp; k++) + else { - double psc = 0.5*dt_vitesse(fac3,k)*surface(fac3)*porosite(fac3); - if (psc>0) - { - const int elem = elem_(fac1, 0), elem2 = elem_(fac1, 1); - const int e = dt_vitesse(fac1, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); - const double aa_r = (a_r && DERIVED_T::IS_AMONT) ? (*a_r)(e, k) : 1.0; - flux1_2[k] = -aa_r * psc * inco(fac1, k); - } - else - { - const int elem = elem_(fac2, 0), elem2 = elem_(fac2, 1); - const int e = dt_vitesse(fac2, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); - const double aa_r = (a_r && DERIVED_T::IS_AMONT) ? (*a_r)(e, k) : 1.0; - flux1_2[k] = (DERIVED_T::IS_CENTRE || DERIVED_T::IS_CENTRE4) ? -psc*0.5*(inco(fac1,k)+inco(fac2,k)) : -aa_r * psc * inco(fac2, k); - } + const int elem = elem_(fac4,0) > 0 ? 
elem_(fac4,0) : elem_(fac4,1); + if (a_r.size()>0) psc *= a_r(elem,k); + flux = -psc*inco(fac4,k); } } -template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PERIODICITE, void> -Eval_Conv_VDF_Face::flux_arete(const DoubleTab& inco, const DoubleTab* a_r, int fac1, int fac2 , int fac3, int fac4, Type_Double& flux3_4, Type_Double& flux1_2) const +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Conv_VDF_Face::flux_arete_comp(CDoubleTabView inco, CDoubleTabView val_imp_face_bord, CDoubleTabView, CDoubleTabView a_r, int fac1, int fac2, int fac3, int signe, int ncomp, int k, double& flux3, double& flux1_2) const { - assert(flux3_4.size_array() == flux1_2.size_array()); - if (DERIVED_T::IS_QUICK) // XXX : LOL + constexpr bool is_SYM = (Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE); + if (DERIVED_T::IS_AXI && is_SYM) { flux3 = 0.; flux1_2 = 0.; return; } + const int ori = orientation(fac3); + double psc = 0.25*((dt_vitesse(fac1,k)*porosite(fac1)+dt_vitesse(fac2,k)*porosite(fac2))*(surface(fac1)+surface(fac2))); + if ((psc*signe) > 0) { - if (DERIVED_T::IS_AXI) return; - else - { - flux_arete < Type_Flux_Arete::INTERNE > (inco, a_r, fac1, fac2, fac3, fac4, flux3_4); - flux_arete < Type_Flux_Arete::INTERNE > (inco, a_r, fac3, fac4, fac1, fac2, flux1_2); - return; - } + const int elem = elem_(fac3,0), elem2 = elem_(fac3,1); + const int e = dt_vitesse(fac3,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); + const double aa_r = (a_r.data() && DERIVED_T::IS_AMONT) ? a_r(e,k) : 1.0; + flux3 = -aa_r*inco(fac3,k)*psc; } - const int ncomp = flux3_4.size_array(); - - // FIXME : pb_multi ! - if (ncomp > 1) throw; + else + { + const int ind = ncomp*ori+k; + const int elem = elem_(fac3,0), elem2 = elem_(fac3,1); + const int e = dt_vitesse(fac3,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); + const double aa_r = (a_r.data() && DERIVED_T::IS_AMONT) ? 
a_r(e,k) : 1.0; + flux3 = -aa_r*0.5*(val_imp_face_bord(fac1,ind)+val_imp_face_bord(fac2,ind))*psc; + } + psc = 0.5*dt_vitesse(fac3,k)*surface(fac3)*porosite(fac3); + if (psc > 0) + { + const int elem = elem_(fac1,0), elem2 = elem_(fac1,1); + const int e = dt_vitesse(fac1,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); + const double aa_r = (a_r.data() && DERIVED_T::IS_AMONT) ? a_r(e,k) : 1.0; + flux1_2 = -aa_r*psc*inco(fac1,k); + } + else + { + const int elem = elem_(fac2,0), elem2 = elem_(fac2,1); + const int e = dt_vitesse(fac2,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); + const double aa_r = (a_r.data() && DERIVED_T::IS_AMONT) ? a_r(e,k) : 1.0; + flux1_2 = (DERIVED_T::IS_CENTRE || DERIVED_T::IS_CENTRE4) ? -psc*0.5*(inco(fac1,k)+inco(fac2,k)) : -aa_r*psc*inco(fac2,k); + } +} - double psc = 0.25*(dt_vitesse(fac1)*porosite(fac1)+dt_vitesse(fac2)*porosite(fac2))*(surface(fac1) +surface(fac2)); +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Conv_VDF_Face::flux_arete_comp(CDoubleTabView inco, CDoubleTabView a_r, int fac1, int fac2, int fac3, int fac4, int k, double& flux3_4, double& flux1_2) const +{ + if (DERIVED_T::IS_QUICK) + { + if (DERIVED_T::IS_AXI) { flux3_4 = 0.; flux1_2 = 0.; return; } + flux_arete_comp(inco, a_r, fac1, fac2, fac3, fac4, k, flux3_4); + flux_arete_comp(inco, a_r, fac3, fac4, fac1, fac2, k, flux1_2); + return; + } + double psc = 0.25*(dt_vitesse(fac1)*porosite(fac1)+dt_vitesse(fac2)*porosite(fac2))*(surface(fac1)+surface(fac2)); if (DERIVED_T::IS_CENTRE) - for (int k = 0; k < ncomp; k++) flux3_4[k] = -psc*0.5*(inco(fac3,k)+inco(fac4,k)); + flux3_4 = -psc*0.5*(inco(fac3,k)+inco(fac4,k)); else if (DERIVED_T::IS_CENTRE4) { - const int ori = orientation(fac1), num0_0 = face_amont_conj_(fac3,ori,0),num1_1 = face_amont_conj_(fac4,ori,1); - if ( (num0_0 == -1)||(num1_1== -1) ) - for (int k = 0; k < ncomp; k++) flux3_4[k] = -psc*0.5*(inco(fac3,k)+inco(fac4,k)); // Schema centre 2 (pas assez 
de faces) - else // Schema Centre4 + const int ori = orientation(fac1), num0_0 = face_amont_conj_(fac3,ori,0), num1_1 = face_amont_conj_(fac4,ori,1); + if ((num0_0==-1)||(num1_1==-1)) + flux3_4 = -psc*0.5*(inco(fac3,k)+inco(fac4,k)); + else { - Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1_1(ncomp), vit_1(ncomp); - const double dx = dist_face_period_(fac3,fac4,ori), dxam = dist_face_period_(num0_0,fac3,ori), dxav = dist_face_period_(fac4,num1_1,ori); - double g1, g2, g3, g4; + const double dx = dist_face_period_(fac3,fac4,ori), dxam = dist_face_period_(num0_0,fac3,ori), dxav = dist_face_period_(fac4,num1_1,ori); + double g1,g2,g3,g4; calcul_g_(dxam,dx,dxav,g1,g2,g3,g4); - for (int k = 0; k < ncomp; k++) - { - vit_0_0[k] = inco(num0_0,k); - vit_0[k] = inco(fac3,k); - vit_1[k] = inco(fac4,k); - vit_1_1[k] = inco(num1_1,k); - flux3_4[k] = -conv_centre_(psc,vit_0_0[k],vit_0[k],vit_1[k],vit_1_1[k],g1,g2,g3,g4); - } + flux3_4 = -conv_centre_(psc,inco(num0_0,k),inco(fac3,k),inco(fac4,k),inco(num1_1,k),g1,g2,g3,g4); } } else - { - if (psc>0) - for (int k = 0; k < ncomp; k++) - { -// if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)(elem_(fac3,0),0); // FIXME - flux3_4[k] = -psc*inco(fac3,k); - } - else for (int k = 0; k < ncomp; k++) - { -// if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)(elem_(fac4,0),0); // FIXME - flux3_4[k] = -psc*inco(fac4,k); - } - } + flux3_4 = psc > 0 ? 
-psc*inco(fac3,k) : -psc*inco(fac4,k); - psc = 0.25*(dt_vitesse(fac3)*porosite(fac3)+dt_vitesse(fac4)*porosite(fac4))*(surface(fac3)+surface(fac4)); + psc = 0.25*(dt_vitesse(fac3)*porosite(fac3)+dt_vitesse(fac4)*porosite(fac4))*(surface(fac3)+surface(fac4)); if (DERIVED_T::IS_CENTRE) - for (int k = 0; k < ncomp; k++) flux1_2[k] = -psc*0.5*(inco(fac1,k)+inco(fac2,k)); + flux1_2 = -psc*0.5*(inco(fac1,k)+inco(fac2,k)); else if (DERIVED_T::IS_CENTRE4) { - const int ori = orientation(fac3), num0_0 = face_amont_conj_(fac1,ori,0), num1_1 = face_amont_conj_(fac2,ori,1); - - if ( (num0_0 == -1)||(num1_1== -1) ) - for (int k=0; k(fac3), num0_0 = face_amont_conj_(fac1,ori,0), num1_1 = face_amont_conj_(fac2,ori,1); + if ((num0_0==-1)||(num1_1==-1)) + flux1_2 = -psc*0.5*(inco(fac1,k)+inco(fac2,k)); + else { - Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1_1(ncomp), vit_1(ncomp); - const double dx = dist_face_period_(fac1,fac2,ori),dxam = dist_face_period_(num0_0,fac1,ori), dxav = dist_face_period_(fac2,num1_1,ori); - double g1, g2, g3, g4; + const double dx = dist_face_period_(fac1,fac2,ori), dxam = dist_face_period_(num0_0,fac1,ori), dxav = dist_face_period_(fac2,num1_1,ori); + double g1,g2,g3,g4; calcul_g_(dxam,dx,dxav,g1,g2,g3,g4); - for (int k = 0; k < ncomp; k++) - { - vit_0_0[k] = inco(num0_0,k); - vit_0[k] = inco(fac1,k); - vit_1[k] = inco(fac2,k); - vit_1_1[k]=inco(num1_1,k); - flux1_2[k] = -conv_centre_(psc,vit_0_0[k],vit_0[k],vit_1[k],vit_1_1[k],g1,g2,g3,g4); - } + flux1_2 = -conv_centre_(psc,inco(num0_0,k),inco(fac1,k),inco(fac2,k),inco(num1_1,k),g1,g2,g3,g4); } } else - { - if (psc>0) - for (int k = 0; k < ncomp; k++) - { -// if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)(elem_(fac1,0),0); // FIXME - flux1_2[k] = -psc*inco(fac1,k); - } - else for (int k = 0; k < ncomp; k++) - { -// if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)(elem_(fac2,0),0); // FIXME - flux1_2[k] = -psc*inco(fac2,k); - } - } + flux1_2 = psc > 0 ? 
-psc*inco(fac1,k) : -psc*inco(fac2,k); } -template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void> -Eval_Conv_VDF_Face::flux_arete(const DoubleTab& inco, const DoubleTab* a_r, int fac1, int , int fac3, int signe, Type_Double& flux3, Type_Double& flux1_2) const +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Conv_VDF_Face::flux_arete_comp(CDoubleTabView inco, CDoubleTabView val_imp_face_bord, CDoubleTabView, CDoubleTabView, int fac1, int, int fac3, int signe, int k, double& flux3, double& flux1_2) const { - assert(flux3.size_array() == flux1_2.size_array()); - if (!DERIVED_T::IS_AMONT) - { - Cerr << "Flux_arete with Type_Flux_Arete::COIN_FLUIDE is only coded for amont scheme !" < 1) throw; - - double psc = 0.5 * dt_vitesse(fac1) * porosite(fac1) * surface(fac1); - if ((psc * signe) > 0) - for (int k = 0; k < ncomp; k++) - { -// if (a_r) psc *= (*a_r)(elem_(fac3,0),0); // FIXME - flux3[k] = -inco(fac3,k) * psc; - } - else - { - const int pfb = premiere_face_bord(), rang1 = (fac1 - pfb), ori = orientation(fac3); - for (int k = 0; k < ncomp; k++) flux3[k] = -Champ_Face_get_val_imp_face_bord(inconnue->temps(), rang1, ori, la_zcl()) * psc; - } + if (!DERIVED_T::IS_AMONT) { Process::Kokkos_exit("COIN_FLUIDE flux_arete_comp: only coded for amont"); flux3 = 0.; flux1_2 = 0.; return; } + double psc = 0.5*dt_vitesse(fac1)*porosite(fac1)*surface(fac1); + flux3 = ((psc*signe) > 0) ? -inco(fac3,k)*psc : -val_imp_face_bord(fac1, orientation(fac3))*psc; + psc = 0.5*dt_vitesse(fac3)*surface(fac3)*porosite(fac3); + flux1_2 = psc > 0 ? 
-psc*inco(fac1,k) : -psc*val_imp_face_bord(fac3, orientation(fac1)); +} - psc = 0.5 * dt_vitesse(fac3) * surface(fac3) * porosite(fac3); - if (psc > 0) - for (int k = 0; k < ncomp; k++) - { -// if (a_r) psc *= (*a_r)(elem_(fac1,0),0); // FIXME - flux1_2[k] = -psc * inco(fac1,k); - } - else - { - const int pfb = premiere_face_bord(), rang3 = (fac3 - pfb), ori = orientation(fac1); - for (int k = 0; k < ncomp; k++) flux1_2[k] = -psc * Champ_Face_get_val_imp_face_bord(inconnue->temps(), rang3, ori, la_zcl()); - } +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Conv_VDF_Face::flux_fa7_comp(CDoubleTabView inco, CDoubleTabView a_r, int fac1, CDoubleTabView, int, int k, double& flux) const +{ + const int elem1 = elem_(fac1,0), elem2 = elem_(fac1,1); + double psc = dt_vitesse(fac1,k)*surface(fac1); + if (a_r.data() && DERIVED_T::IS_AMONT) psc *= a_r((elem1 != -1) ? elem1 : elem2, k); + flux = -psc*inco(fac1,k)*porosite(fac1); } /* ************************************** * @@ -507,7 +431,7 @@ Eval_Conv_VDF_Face::coeffs_arete(const DoubleTab* a_r, int fac1, int template template inline std::enable_if_t -Eval_Conv_VDF_Face::coeffs_arete(const DoubleTab* a_r, int fac1, int fac2,int fac3,int signe,Type_Double& aii1_2, Type_Double& aii3_4, Type_Double& ajj1_2) const +Eval_Conv_VDF_Face::coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab* a_r, int fac1, int fac2,int fac3,int signe,Type_Double& aii1_2, Type_Double& aii3_4, Type_Double& ajj1_2) const { assert(aii1_2.size_array() == aii3_4.size_array() && aii1_2.size_array() == ajj1_2.size_array()); if (DERIVED_T::IS_CENTRE || DERIVED_T::IS_AXI || DERIVED_T::IS_CENTRE4) return; @@ -541,6 +465,7 @@ Eval_Conv_VDF_Face::coeffs_arete(const DoubleTab* a_r, int fac1, int } } + template template inline void Eval_Conv_VDF_Face::fill_coeffs_proto(const int k, const double psc1, const double psc2, Type_Double& A, Type_Double& B) const { diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face_leaves.h 
b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face_leaves.h index 337041dc7e..5eff0ced62 100644 --- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face_leaves.h +++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face_leaves.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -44,6 +44,17 @@ class Eval_Amont_VDF_Face : public Eval_Conv_VDF_Face, publ { public: static constexpr bool IS_AMONT = true, CALC_ARR_COIN_FL = true; + + KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const + { Process::Kokkos_exit("Error!"); return 0; } + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, + const double dx, const double dm,const double dxam) const + { Process::Kokkos_exit("Error!"); return 0; } + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0, + const double dx, const double dm, const double dxam) const + { Process::Kokkos_exit("Error!"); return 0; } + KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const + { Process::Kokkos_exit("Error!"); } }; /*! 
@brief class Eval_Centre_VDF_Face Evaluateur VDF pour la convection Le champ convecte est un Champ_Face_VDF @@ -55,6 +66,16 @@ class Eval_Centre_VDF_Face : public Eval_Conv_VDF_Face, pu { public: static constexpr bool IS_CENTRE = true; + + KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const + { Process::Kokkos_exit("Error!"); return 0; } + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, + const double dx, const double dm,const double dxam) const + { Process::Kokkos_exit("Error!"); return 0; } + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0,const double dx, const double dm, const double dxam) const + { Process::Kokkos_exit("Error!"); return 0; } + KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const + { Process::Kokkos_exit("Error!"); } }; /*! 
@brief class Eval_Centre4_VDF_Face Evaluateur VDF pour la convection Le champ convecte est un Champ_Face_VDF @@ -67,16 +88,17 @@ class Eval_Centre4_VDF_Face : public Eval_Conv_VDF_Face, public: static constexpr bool IS_CENTRE4 = true; - inline int face_amont_conj(int num_face,int i,int k) const override { return le_dom->face_amont_conj(num_face, i, k); } - inline int face_amont_princ(int num_face,int i) const override { return le_dom->face_amont_princ(num_face, i); } inline double dist_face(int n1,int n2,int k) const { return le_dom->dist_face(n1,n2,k); } inline double dist_face_period(int n1,int n2,int k) const { return le_dom->dist_face_period(n1,n2,k); } - inline double dist_elem_period(int n1,int n2,int k) const override { return le_dom->dist_elem_period(n1,n2,k); } - inline double dim_elem(int n1,int k) const override { return le_dom->dim_elem(n1,k); } - inline double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const override + KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const { return (g1*vit_0_0 + g2*vit_0 + g3*vit_1 + g4*vit1_1) * psc; } - - inline void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const override + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, + const double dx, const double dm,const double dxam) const + { Process::Kokkos_exit("Error!"); return 0; } + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0, + const double dx, const double dm, const double dxam) const + { Process::Kokkos_exit("Error!"); return 0; } + KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const 
double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const {return calcul_g_impl(dxam,dx,dxav,g1,g2,g3,g4); } }; @@ -90,21 +112,21 @@ class Eval_Quick_VDF_Face : public Eval_Conv_VDF_Face, publ public: static constexpr bool IS_QUICK = true; - inline int face_amont_conj(int num_face,int i,int k) const override { return le_dom->face_amont_conj(num_face, i, k); } - inline int face_amont_princ(int num_face,int i) const override { return le_dom->face_amont_princ(num_face, i); } - inline double dim_elem(int n1,int k) const override { return le_dom->dim_elem(n1,k); } - inline double dim_face(int n1,int k) const override { return le_dom->dim_face(n1,k); } inline double dist_face(int n1,int n2,int k) const { return le_dom->dist_face(n1,n2,k); } - inline double dist_elem(int n1,int n2,int k) const override { return le_dom->dist_elem(n1,n2,k); } - inline double dist_elem_period(int n1, int n2, int k) const override { return le_dom->dist_elem_period(n1,n2,k); } inline double dist_face_period(int n1,int n2,int k) const { return le_dom->dist_face_period(n1,n2,k); } - inline double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0, - const double dx, const double dm, const double dxam) const override + KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const + { Process::Kokkos_exit("Error!"); return 0; } + + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0, + const double dx, const double dm, const double dxam) const { return conv_quick_sharp_plus_impl(psc,vit_0,vit_1,vit_0_0,dx,dm,dxam); } - inline double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, - const double dx, const double dm,const double dxam) const override + 
KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, + const double dx, const double dm,const double dxam) const { return conv_quick_sharp_moins_impl(psc,vit_0,vit_1,vit_1_1,dx,dm,dxam); } + + KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const + { Process::Kokkos_exit("Error!"); } }; /*! @brief class Eval_Quick_VDF_Face_Axi Evaluateur VDF pour la convection en coordonnees cylindriques : Le champ convecte est un Champ_Face_VDF @@ -117,26 +139,31 @@ class Eval_Quick_VDF_Face_Axi : public Eval_Conv_VDF_Faceface_amont_princ(num_face, i); } - inline int face_amont_conj(int ,int ,int ) const override; + inline int face_amont_conj(int ,int ,int ) const; inline double dist_face(int ,int ,int ) const; - inline double dist_elem_period(int n1, int n2, int k) const override { return dist_face(n1,n2,k); } - inline double dim_face(int ,int ) const override; - inline double dist_elem(int ,int ,int ) const override; - inline double dim_elem(int ,int ) const override; - inline double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0, - const double dx, const double dm, const double dxam) const override + inline double dist_elem_period(int n1, int n2, int k) const { return dist_face(n1,n2,k); } + inline double dim_face(int ,int ) const; + inline double dist_elem(int ,int ,int ) const; + inline double dim_elem(int ,int ) const; + KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const + { Process::Kokkos_exit("Error!"); return 0; } + + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0, + const double dx, const double dm, const double dxam) const { return 
conv_quick_sharp_plus_impl(psc,vit_0,vit_1,vit_0_0,dx,dm,dxam); } - inline double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, - const double dx, const double dm,const double dxam) const override + KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, + const double dx, const double dm,const double dxam) const { return conv_quick_sharp_moins_impl(psc,vit_0,vit_1,vit_1_1,dx,dm,dxam); } + + KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const + { Process::Kokkos_exit("Error!"); } }; inline double Eval_Quick_VDF_Face_Axi::dim_elem(int n1, int k) const { const IntTab& elem_faces_ = le_dom->elem_faces(); - return dist_face(elem_faces_(n1,k), elem_faces_(n1,k+dimension), k) ; + return dist_face(elem_faces_(n1,k), elem_faces_(n1,k+Objet_U::dimension), k) ; } inline double Eval_Quick_VDF_Face_Axi::dist_elem(int n1, int n2, int k) const @@ -165,7 +192,7 @@ inline int Eval_Quick_VDF_Face_Axi::face_amont_conj(int num_face, int k, int i) const IntTab& face_voisins_ = le_dom->face_voisins(); const IntTab& elem_faces_ = le_dom->elem_faces(); const IntVect& orientation_ = le_dom->orientation(); - return face_amont_conj_axi_impl(num_face,k,i,dimension,face_voisins_,elem_faces_,orientation_); + return face_amont_conj_axi_impl(num_face,k,i,Objet_U::dimension,face_voisins_,elem_faces_,orientation_); } #endif /* Eval_Conv_VDF_Face_leaves_included */ diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.cpp b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.cpp index d9912cb61e..8b2232cf5a 100644 --- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.cpp +++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2022, CEA +* Copyright (c) 2025, 
CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -15,44 +15,6 @@ #include -// quick pour un champ face -double Eval_Conv_VDF_tools::conv_quick_sharp_plus_impl(const double psc,const double vit_0, const double vit_1, - const double vit_0_0, const double dx, - const double dm, const double dxam) const -{ - double cf, curv, delta_0 = vit_0 - vit_0_0, delta = vit_1 - vit_0, dd1,utc, delta_delta; - curv = (delta/dx - delta_0/dxam)/dm ; - // Calcul de cf: - delta_delta = delta_0+delta; - dd1 = std::fabs(delta_delta); - if (dd1 < 1.e-5) cf = 0.125; - else - { - utc = delta_0/delta_delta; - cf = sharp2(utc); - } - return (0.5*(vit_0 + vit_1) - cf*(dx*dx)*curv)*psc; -} - -// quick pour un champ face -double Eval_Conv_VDF_tools::conv_quick_sharp_moins_impl(const double psc,const double vit_0,const double vit_1, - const double vit_1_1,const double dx, - const double dm,const double dxam) const -{ - double cf, curv, delta_1 = vit_1_1 - vit_1, delta = vit_1 - vit_0, dd1,utc, delta_delta; - curv = ( delta_1/dxam - delta/dx )/dm ; - // Calcul de cf: - delta_delta = delta_1+delta; - dd1 = std::fabs(delta_delta); - if (dd1 < 1.e-5) cf = 0.125; - else - { - utc = delta_1/delta_delta; - cf = sharp2(utc); - } - return (0.5*(vit_0 + vit_1) - cf*(dx*dx)*curv)*psc; -} - int Eval_Conv_VDF_tools::face_amont_conj_axi_impl(int num_face, int k, int i, int dimension, const IntTab& face_voisins, const IntTab& elem_faces, const IntVect& orientation) const @@ -108,12 +70,3 @@ double Eval_Conv_VDF_tools::dist_elem_axi_impl(int n1, int n2, int k, const Doub } return dist ; } - -// Calcul des coefficients g1,g2,g3,g4 a partir de dxam,dx,dxav -void Eval_Conv_VDF_tools::calcul_g_impl(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const -{ - g1 = -dx*dx*(dx/2+dxav)/(4*(dx+dxam+dxav)*(dx+dxam)*dxam); - g2 = 
(dx+2*dxam)*(dx+2*dxav)/(8*dxam*(dx+dxav)); - g3 = (dx+2*dxam)*(dx+2*dxav)/(8*dxav*(dx+dxam)); - g4 = -dx*dx*(dx/2+dxam)/(4*(dx+dxam+dxav)*dxav*(dx+dxav)); -} diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.h index 127bca092e..ea3b153872 100644 --- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.h +++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.h @@ -23,48 +23,55 @@ class Eval_Conv_VDF_tools { public: virtual ~Eval_Conv_VDF_tools() {} - // DANGER !!!! FAUT JAMAIS ENTRER - virtual int amont_amont(int face, int i) const { return dont_call(__func__); } - virtual int face_amont_conj(int ,int ,int ) const { return dont_call(__func__); } - virtual int face_amont_princ(int ,int ) const { return dont_call(__func__); } - virtual double dim_elem(int ,int ) const { return dont_call(__func__); } - virtual double dim_face(int ,int ) const { return dont_call(__func__); } - virtual double dist_elem(int ,int ,int ) const { return dont_call(__func__); } - virtual double dist_elem_period(int , int , int ) const { return dont_call(__func__); } - virtual double conv_centre(const double,const double,const double,const double,const double,double,double,double,double) const { return dont_call(__func__); } - virtual double conv_quick_sharp_plus(const double,const double,const double,const double,const double,const double,const double) const { return dont_call(__func__); } - virtual double conv_quick_sharp_moins(const double,const double,const double,const double,const double,const double,const double) const { return dont_call(__func__); } - virtual void calcul_g(const double,const double,const double,double&,double&,double&,double&) const { return dont_call(__func__); } - template void qcentre(const double, const int, const int, const int, const int, const int, const DoubleTab&, Type_Double& ) const { return dont_call(__func__); } + KOKKOS_INLINE_FUNCTION + void qcentre_view(const double, const int, const int, 
const int, const int, const int, CDoubleTabView, DoubleArrView) const { return dont_call(__func__); } template void quick_fram(const Type_Double&, const int, const int, const int, const int, const int, const DoubleTab&, Type_Double& ) const { return dont_call(__func__); } + KOKKOS_INLINE_FUNCTION + void quick_fram_view(const double, const int, const int, const int, const int, const int, CDoubleTabView, DoubleArrView) const { return dont_call(__func__); } protected: int face_amont_conj_axi_impl(int ,int ,int ,int , const IntTab& , const IntTab& , const IntVect&) const; double dist_face_axi_impl(int ,int ,int ,const DoubleTab&) const; double dist_elem_axi_impl(int ,int ,int ,const DoubleTab&) const; - double conv_quick_sharp_plus_impl(const double,const double,const double,const double,const double,const double,const double) const ; - double conv_quick_sharp_moins_impl(const double,const double,const double,const double,const double,const double,const double) const; - void calcul_g_impl(const double,const double,const double,double&,double&,double&,double& ) const ; template void qcentre2_impl(const double,const int,const int,const int,const int,const int,const DoubleTab&,Type_Double&) const; + KOKKOS_INLINE_FUNCTION + void qcentre2_impl_view(const double,const int,const int,const int,const int,const int,CDoubleTabView,DoubleArrView) const; + KOKKOS_INLINE_FUNCTION + void qcentre2_impl_comp(const double,const int,const int,CDoubleTabView,const int,double&) const; template void qcentre4_impl(const int,const double,const double,const double,const double,const int,const int,const int,const int,const int,const DoubleTab&,Type_Double&) const; + KOKKOS_INLINE_FUNCTION + void qcentre4_impl_view(const int,const double,const double,const double,const double,const int,const int,const int,const int,const int,CDoubleTabView,DoubleArrView) const; + KOKKOS_INLINE_FUNCTION + void qcentre4_impl_comp(const int,const double,const double,const double,const double,const int,const 
int,const int,const int,CDoubleTabView,const int,double&) const; template void quick_fram_impl(const int,const double,const double,const double,const double,const double,const Type_Double&,const int,const int,const int,const int,const int,const DoubleTab&,Type_Double&) const; + KOKKOS_INLINE_FUNCTION + void quick_fram_impl_view(const int,const double,const double,const double,const double,const double,const double,const int,const int,const int,const int,const int,CDoubleTabView,DoubleArrView) const; + KOKKOS_INLINE_FUNCTION + void quick_fram_impl_comp(const int,const double,const double,const double,const double,const double,const double,const int,const int,const int,const int,CDoubleTabView,const int,double&) const; + + KOKKOS_INLINE_FUNCTION + double conv_quick_sharp_plus_impl(const double,const double,const double,const double,const double,const double,const double) const ; + KOKKOS_INLINE_FUNCTION + double conv_quick_sharp_moins_impl(const double,const double,const double,const double,const double,const double,const double) const; + KOKKOS_INLINE_FUNCTION + void calcul_g_impl(const double,const double,const double,double&,double&,double&,double& ) const ; private: template - type dont_call (const char * nom_fct) const + KOKKOS_INLINE_FUNCTION type dont_call (const char * nom_fct) const { - Cerr << "What ??? You should not call the function " << nom_fct << finl; - throw; + Process::Kokkos_exit("What ??? 
You should not call the function."); + if constexpr (!std::is_void_v) return type {}; } }; @@ -80,25 +87,26 @@ inline double Fram4(const double s1,const double s2, const double s3,const doubl } // Fram pour QUICK -inline double Fram(const double s1,const double s2, const double s3,const double s4) +KOKKOS_INLINE_FUNCTION +double Fram(const double s1,const double s2, const double s3,const double s4) { - double smin0 = std::min(s4,s2), smax0 = std::max(s4,s2), smin1 = std::min(s3,s1), smax1 = std::max(s3,s1); + double smin0 = Kokkos::min(s4,s2), smax0 = Kokkos::max(s4,s2), smin1 = Kokkos::min(s3,s1), smax1 = Kokkos::max(s3,s1); // Ajout du DMINFLOAT car le compilateur Nvidia evalue quand meme (bug) si smax0-smin0=0... - double sr0 = (std::fabs(smax0-smin0)= 1.5) ) cf = 0.125; else if ((utc > -1) && (utc <= 0) ) cf = 0.5 + 0.375*utc; - else if ((utc <= 0.25) && (utc > 0) ) cf = 0.5 - 0.625*sqrt(utc); + else if ((utc <= 0.25) && (utc > 0) ) cf = 0.5 - 0.625*Kokkos::sqrt(utc); else if ((utc > 0.25) && (utc <= 1.) ) cf = 0.25*(1.-utc); else cf = 0.25*(utc-1.); return cf; @@ -109,13 +117,7 @@ void Eval_Conv_VDF_tools::qcentre2_impl(const double psc, const int num0, const const DoubleTab& transporte,Type_Double& flux) const { int k, ncomp = flux.size_array(); - Type_Double T0(ncomp), T1(ncomp); - for (k=0; k @@ -123,17 +125,9 @@ void Eval_Conv_VDF_tools::qcentre4_impl(const int ori,const double dx, const dou const int num0_0, const int num1_1, const int face, const DoubleTab& transporte,Type_Double& flux) const { int k, ncomp = flux.size_array(); - Type_Double T0(ncomp), T0_0(ncomp), T1(ncomp), T1_1(ncomp); - for (k=0; k @@ -145,18 +139,100 @@ void Eval_Conv_VDF_tools::quick_fram_impl(const int ori,const double dx, const d for (int k=0; k= 0 ) || (num1_1 == -1 && psc[k] <= 0 ) ) + T0 = transporte(num0,k); + T0_0 = (num0_0!=-1?transporte(num0_0,k):0); + T1 = transporte(num1,k); + T1_1 = (num1_1!=-1?transporte(num1_1,k):0); + + if (psc > 0) { - flux[k] = (psc[k] > 0) ? 
psc[k]*transporte(num0,k) : psc[k]*transporte(num1,k); + assert(num0_0!=-1); + trans_amont = T0; + curv = ( (T1 - T0)/dx - (T0 - T0_0)/dxam0 )/dm0 ; } else { - T0 = transporte(num0,k); - T0_0 = (num0_0!=-1?transporte(num0_0,k):0); - T1 = transporte(num1,k); - T1_1 = (num1_1!=-1?transporte(num1_1,k):0); + assert(num1_1!=-1); + trans_amont = T1; + curv = ( (T1_1 - T1)/dxam1 - (T1 - T0)/dx )/dm1; + } + flux[k] = 0.5*(T0+T1) - 0.125*(dx*dx)*curv; + // On applique le filtre Fram: + fr = ( num0_0 != -1 && num1_1 != -1 ) ? Fram(T0_0,T0,T1,T1_1) : 1.; + flux[k] = ((1.-fr)*flux[k] + fr*trans_amont)*psc; + } +} + +// Views implementation: +KOKKOS_INLINE_FUNCTION +void Eval_Conv_VDF_tools::qcentre2_impl_view(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, + CDoubleTabView transporte,DoubleArrView flux) const +{ + const int ncomp = (int)flux.size(); + for (int k=0; k 0) +KOKKOS_INLINE_FUNCTION +void Eval_Conv_VDF_tools::qcentre2_impl_comp(const double psc, const int num0, const int num1, + CDoubleTabView transporte, const int k, double& flux) const +{ + flux = 0.5 * (transporte(num0, k) + transporte(num1, k)) * psc; +} + +KOKKOS_INLINE_FUNCTION +void Eval_Conv_VDF_tools::qcentre4_impl_view(const int ori,const double dx, const double dxam, const double dxav, const double psc, const int num0, const int num1, + const int num0_0, const int num1_1, const int face, CDoubleTabView transporte,DoubleArrView flux) const +{ + const int ncomp = (int)flux.size(); + const double g1 = -dx*dx*(dx/2+dxav)/(4*(dx+dxam+dxav)*(dx+dxam)*dxam), g2 = (dx+2*dxam)*(dx+2*dxav)/(8*dxam*(dx+dxav)); + const double g3 = (dx+2*dxam)*(dx+2*dxav)/(8*dxav*(dx+dxam)), g4 = -dx*dx*(dx/2+dxam)/(4*(dx+dxam+dxav)*dxav*(dx+dxav)); + for (int k=0; k= 0 ) || (num1_1 == -1 && psc <= 0 ) ) + { + flux[k] = (psc > 0) ? 
psc*transporte(num0,k) : psc*transporte(num1,k); + } + else + { + double T0 = transporte(num0,k); + double T0_0 = (num0_0!=-1?transporte(num0_0,k):0); + double T1 = transporte(num1,k); + double T1_1 = (num1_1!=-1?transporte(num1_1,k):0); + double trans_amont, curv; + if (psc > 0) { trans_amont = T0; curv = ( (T1 - T0)/dx - (T0 - T0_0)/dxam0 )/dm0 ; @@ -168,10 +244,88 @@ void Eval_Conv_VDF_tools::quick_fram_impl(const int ori,const double dx, const d } flux[k] = 0.5*(T0+T1) - 0.125*(dx*dx)*curv; // On applique le filtre Fram: - fr = ( num0_0 != -1 && num1_1 != -1 ) ? Fram(T0_0,T0,T1,T1_1) : 1.; - flux[k] = ((1.-fr)*flux[k] + fr*trans_amont)*psc[k]; + double fr = ( num0_0 != -1 && num1_1 != -1 ) ? Fram(T0_0,T0,T1,T1_1) : 1.; + flux[k] = ((1.-fr)*flux[k] + fr*trans_amont)*psc; } } } +KOKKOS_INLINE_FUNCTION +void Eval_Conv_VDF_tools::quick_fram_impl_comp(const int ori, const double dx, const double dm0, const double dxam0, + const double dm1, const double dxam1, const double psc, + const int num0, const int num1, const int num0_0, const int num1_1, + CDoubleTabView transporte, const int k, double& flux) const +{ + const double T0 = transporte(num0, k); + const double T0_0 = (num0_0 != -1 ? transporte(num0_0, k) : 0.); + const double T1 = transporte(num1, k); + const double T1_1 = (num1_1 != -1 ? transporte(num1_1, k) : 0.); + double trans_amont, curv; + if (psc > 0) + { + assert(num0_0 != -1); + trans_amont = T0; + curv = ((T1 - T0)/dx - (T0 - T0_0)/dxam0) / dm0; + } + else + { + assert(num1_1 != -1); + trans_amont = T1; + curv = ((T1_1 - T1)/dxam1 - (T1 - T0)/dx) / dm1; + } + double val = 0.5*(T0+T1) - 0.125*(dx*dx)*curv; + const double fr = (num0_0 != -1 && num1_1 != -1) ? 
Fram(T0_0, T0, T1, T1_1) : 1.; + flux = ((1.-fr)*val + fr*trans_amont) * psc; +} + +// quick pour un champ face +KOKKOS_INLINE_FUNCTION +double Eval_Conv_VDF_tools::conv_quick_sharp_plus_impl(const double psc,const double vit_0, const double vit_1, + const double vit_0_0, const double dx, + const double dm, const double dxam) const +{ + double cf, curv, delta_0 = vit_0 - vit_0_0, delta = vit_1 - vit_0, dd1,utc, delta_delta; + curv = (delta/dx - delta_0/dxam)/dm ; + // Calcul de cf: + delta_delta = delta_0+delta; + dd1 = Kokkos::fabs(delta_delta); + if (dd1 < 1.e-5) cf = 0.125; + else + { + utc = delta_0/delta_delta; + cf = sharp2(utc); + } + return (0.5*(vit_0 + vit_1) - cf*(dx*dx)*curv)*psc; +} + +// quick pour un champ face +KOKKOS_INLINE_FUNCTION +double Eval_Conv_VDF_tools::conv_quick_sharp_moins_impl(const double psc,const double vit_0,const double vit_1, + const double vit_1_1,const double dx, + const double dm,const double dxam) const +{ + double cf, curv, delta_1 = vit_1_1 - vit_1, delta = vit_1 - vit_0, dd1,utc, delta_delta; + curv = ( delta_1/dxam - delta/dx )/dm ; + // Calcul de cf: + delta_delta = delta_1+delta; + dd1 = Kokkos::fabs(delta_delta); + if (dd1 < 1.e-5) cf = 0.125; + else + { + utc = delta_1/delta_delta; + cf = sharp2(utc); + } + return (0.5*(vit_0 + vit_1) - cf*(dx*dx)*curv)*psc; +} + +// Calcul des coefficients g1,g2,g3,g4 a partir de dxam,dx,dxav +KOKKOS_INLINE_FUNCTION +void Eval_Conv_VDF_tools::calcul_g_impl(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const +{ + g1 = -dx*dx*(dx/2+dxav)/(4*(dx+dxam+dxav)*(dx+dxam)*dxam); + g2 = (dx+2*dxam)*(dx+2*dxav)/(8*dxam*(dx+dxav)); + g3 = (dx+2*dxam)*(dx+2*dxav)/(8*dxav*(dx+dxam)); + g4 = -dx*dx*(dx/2+dxam)/(4*(dx+dxam+dxav)*dxav*(dx+dxav)); +} + #endif /* Eval_Conv_VDF_tools_included */ diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF.h index fb72ea0311..2e4179e54f 100644 --- 
a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF.h +++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF.h @@ -24,6 +24,16 @@ class Eval_Diff_VDF { public: + inline Eval_Diff_VDF() { } + inline Eval_Diff_VDF(const Eval_Diff_VDF& eval) + { + is_var_ = eval.is_var_; + ref_probleme_ = eval.ref_probleme_; + ref_diffusivite_ = eval.ref_diffusivite_; + tab_diffusivite_.ref(eval.tab_diffusivite_); + tab_alpha_.ref(eval.tab_alpha_); + tab_diffusivite_v_ = eval.tab_diffusivite_v_; + } virtual ~Eval_Diff_VDF() { } inline const int& is_var() const { return is_var_; } @@ -71,43 +81,69 @@ class Eval_Diff_VDF update_diffusivite(); } + // Template function by ExecSpace to get TRUSTTab tab_diffusivite_ (array on host) or view (array on device) + template + KOKKOS_INLINE_FUNCTION double tab_diffusivite(int face, int comp) const { if constexpr (std::is_same::value) return tab_diffusivite_(face, comp); else return tab_diffusivite_v_(face, comp); } + // Methods used by the flux computation in template class: - inline double compute_heq_impl(double d0, int i, double d1, int j, int compo) const + template + KOKKOS_INLINE_FUNCTION double compute_heq_impl(double d0, int i, double d1, int j, int compo) const { - return 1. / (d0 / tab_diffusivite_(is_var_ * i, compo) + d1 / tab_diffusivite_(is_var_ * j, compo)); + return 1. 
/ (d0 / tab_diffusivite(is_var_ * i, compo) + d1 / tab_diffusivite(is_var_ * j, compo)); } - - inline double nu_1_impl(int i, int compo) const { return tab_diffusivite_(is_var_ * i, compo); } - inline double nu_2_impl(int i, int compo) const { return tab_diffusivite_(is_var_ * i, compo); } - - inline double nu_1_impl_face(int i, int j, int compo) const + template + KOKKOS_INLINE_FUNCTION double nu_1_impl(int i, int compo) const { - return 0.5 * (tab_diffusivite_(is_var_ * i, compo) + tab_diffusivite_(is_var_ * j, compo)); + return tab_diffusivite(is_var_ * i, compo); } - - inline double nu_2_impl_face(int i, int j, int k, int l, int compo) const + template + KOKKOS_INLINE_FUNCTION double nu_2_impl(int i, int compo) const { - return 0.25 * (tab_diffusivite_(is_var_ * i, compo) + tab_diffusivite_(is_var_ * j, compo) + tab_diffusivite_(is_var_ * k, compo) + tab_diffusivite_(is_var_ * l, compo)); + return tab_diffusivite(is_var_ * i, compo); + } + template + KOKKOS_INLINE_FUNCTION double nu_1_impl_face(int i, int j, int compo) const + { + return 0.5 * (tab_diffusivite(is_var_ * i, compo) + tab_diffusivite(is_var_ * j, compo)); + } + template + KOKKOS_INLINE_FUNCTION double nu_2_impl_face(int i, int j, int k, int l, int compo) const + { + return 0.25 * (tab_diffusivite(is_var_ * i, compo) + tab_diffusivite(is_var_ * j, compo) + tab_diffusivite(is_var_ * k, compo) + tab_diffusivite(is_var_ * l, compo)); + } + template + KOKKOS_INLINE_FUNCTION double nu_lam_impl_face(int i, int j, int k, int l, int compo) const + { + return nu_2_impl_face(i, j, k, l, compo); + } + template + KOKKOS_INLINE_FUNCTION double nu_lam_impl_face2(int i, int j, int compo) const + { + return nu_1_impl_face(i, j, compo); } - - inline double nu_lam_impl_face(int i, int j, int k, int l, int compo) const { return nu_2_impl_face(i, j, k, l, compo); } - inline double nu_lam_impl_face2(int i, int j, int compo) const { return nu_1_impl_face(i, j, compo); } // These methods will be overloaded in DIFT 
operators (See Eval_Dift_VDF_const_Elem for example ...) inline int get_ind_Fluctu_Term() const { return 0; } - inline double get_dv_mvol(const int i) const { throw; } /* seulement pour K-Eps */ + template + KOKKOS_INLINE_FUNCTION double get_dv_mvol(const int i) const { Kokkos::abort("get_dv_mvol not implemented"); return 0.; } /* seulement pour K-Eps */ inline virtual double get_equivalent_distance(int boundary_index,int local_face) const { return 0; } - inline double nu_t_impl(int i, int compo) const { return 0.; } - inline double tau_tan_impl(int i, int j) const { return 0.; } - inline bool uses_wall() const { return false; } - inline bool uses_mod() const { return false; } + template + KOKKOS_INLINE_FUNCTION double nu_t_impl(int i, int compo) const { return 0.; } + KOKKOS_INLINE_FUNCTION double tau_tan_impl(int i, int j) const { return 0.; } + KOKKOS_INLINE_FUNCTION bool uses_wall() const { return false; } + KOKKOS_INLINE_FUNCTION bool uses_mod() const { return false; } inline const DoubleTab& get_k_elem() const { throw; } // pour F5 seulement ... + virtual void view_ro_impl() const + { + tab_diffusivite_v_ = tab_diffusivite_.view_ro(); + } protected: int is_var_ = 0; OBS_PTR(Probleme_base) ref_probleme_; OBS_PTR(Champ_base) ref_diffusivite_; DoubleTab tab_diffusivite_, tab_alpha_; + mutable CDoubleTabView tab_diffusivite_v_; }; #endif /* Eval_Diff_VDF_included */ diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.h index 274b7a4794..c9ee812652 100644 --- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.h +++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -63,19 +63,12 @@ class Eval_Diff_VDF_Elem_Gen : public Eval_VDF_Elem, public Evaluateur_VDF * ********* POUR L'EXPLICITE ********** * * ************************************** */ - template // Generic return - inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const BC&, int, Type_Double& ) const { /* Do nothing */ } - // To overload - template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Dirichlet_entree_fluide&, const int, Type_Double& ) const; - template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Scalaire_impose_paroi&, const int, Type_Double& ) const; - template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Dirichlet_loi_paroi&, const int, Type_Double& ) const; - template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Neumann_paroi&, const int, Type_Double& ) const; - template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Periodique&, const int, Type_Double& ) const; template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Echange_global_impose&, const int, Type_Double& ) const; - template inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Dirichlet_paroi_fixe&, const int, Type_Double& ) const; template inline void flux_face(const DoubleTab&, const int, const int, const int, const Echange_externe_impose&, const int, Type_Double& ) const; - template inline void flux_faces_interne(const DoubleTab&, const int, Type_Double& ) const; + template + KOKKOS_INLINE_FUNCTION void flux_faces_bord_comp(CDoubleTabView, CDoubleTabView, const int, const BC_View&, const int, const int, double&) const; + KOKKOS_INLINE_FUNCTION void flux_faces_interne_comp(CDoubleTabView, const int, const int, double&) const; /* 
************************************** * * ********* POUR L'IMPLICITE ********** * @@ -103,13 +96,37 @@ class Eval_Diff_VDF_Elem_Gen : public Eval_VDF_Elem, public Evaluateur_VDF template inline void secmem_face(const int, const Echange_global_impose&, const int, Type_Double& ) const; template inline void secmem_face(const int, const int, const int, const Echange_externe_impose&, const int, Type_Double& ) const; template inline void secmem_faces_interne(const int, Type_Double& ) const { /* Do nothing */ } + void view_ro() const override + { + Evaluateur_VDF::view_ro(); + static_cast(this)->view_ro_impl(); + } private: - inline double Dist_face_elem0(const int face, const int n0) const { return DERIVED_T::IS_AXI ? le_dom->dist_face_elem0_axi(face,n0) : le_dom->dist_face_elem0(face,n0); } - inline double Dist_face_elem1(const int face, const int n1) const { return DERIVED_T::IS_AXI ? le_dom->dist_face_elem1_axi(face,n1) : le_dom->dist_face_elem1(face,n1); } - inline double Dist_norm_bord (const int face) const + template + KOKKOS_INLINE_FUNCTION double get_dist_face_elem0(const int face, const int n0) const + { + if constexpr (std::is_same::value) + return DERIVED_T::IS_AXI ? le_dom->dist_face_elem0_axi(face,n0) : le_dom->dist_face_elem0(face,n0); + else + return DERIVED_T::IS_AXI ? le_dom_v_.dist_face_elem0_axi(face,n0) : le_dom_v_.dist_face_elem0(face,n0); + } + template + KOKKOS_INLINE_FUNCTION double get_dist_face_elem1(const int face, const int n1) const + { + if constexpr (std::is_same::value) + return DERIVED_T::IS_AXI ? le_dom->dist_face_elem1_axi(face,n1) : le_dom->dist_face_elem1(face,n1); + else + return DERIVED_T::IS_AXI ? le_dom_v_.dist_face_elem1_axi(face,n1) : le_dom_v_.dist_face_elem1(face,n1); + } + template + KOKKOS_INLINE_FUNCTION double get_dist_norm_bord(const int face) const { - double val = DERIVED_T::IS_AXI ? 
le_dom->dist_norm_bord_axi(face) : le_dom->dist_norm_bord(face); + double val; + if constexpr (std::is_same::value) + val = DERIVED_T::IS_AXI ? le_dom->dist_norm_bord_axi(face) : le_dom->dist_norm_bord(face); + else + val = DERIVED_T::IS_AXI ? le_dom_v_.dist_norm_bord_axi(face) : le_dom_v_.dist_norm_bord(face); return DERIVED_T::IS_MULTD ? val : 2*val; } @@ -133,11 +150,15 @@ class Eval_Diff_VDF_Elem_Gen : public Eval_VDF_Elem, public Evaluateur_VDF // CRTP pattern to static_cast the appropriate class and get the implementation : This is magic ! inline int ind_Fluctu_Term() const { return static_cast(this)->get_ind_Fluctu_Term(); } // See generic impl in the class Eval_Diff_VDF. They will be overloaded for Dift ops - inline double nu_1(const int i, int compo = 0) const { return static_cast(this)->nu_1_impl(i,compo); } - inline double nu_2(const int i, int compo = 0) const { return static_cast(this)->nu_2_impl(i,compo); } - inline double compute_heq(const double d0, const int i0, const double d1, const int i1, int compo = 0) const { return static_cast(this)->compute_heq_impl(d0,i0,d1,i1,compo); } + template + KOKKOS_INLINE_FUNCTION double nu_1(const int i, int compo = 0) const { return static_cast(this)->template nu_1_impl(i,compo); } + template + KOKKOS_INLINE_FUNCTION double nu_2(const int i, int compo = 0) const { return static_cast(this)->template nu_2_impl(i,compo); } + template + KOKKOS_INLINE_FUNCTION double compute_heq(const double d0, const int i0, const double d1, const int i1, int compo = 0) const { return static_cast(this)->template compute_heq_impl(d0,i0,d1,i1,compo); } inline double equivalent_distance (const int boundary_index, const int local_face) const { return static_cast(this)->get_equivalent_distance(boundary_index,local_face); } - inline double dv_mvol(const int i) const { return static_cast(this)->get_dv_mvol(i); } + template + KOKKOS_INLINE_FUNCTION double dv_mvol(const int i) const { return static_cast(this)->template get_dv_mvol(i); } }; 
#include // templates specializations ici ;) diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.tpp b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.tpp index d532e880dd..74a1cf49eb 100644 --- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.tpp +++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.tpp @@ -18,159 +18,6 @@ #include -template template -inline void Eval_Diff_VDF_Elem_Gen::flux_face(const DoubleTab& inco, const DoubleTab& val_b, const int face, const Dirichlet_entree_fluide& la_cl, const int num1, Type_Double& flux) const -{ - // Olga avait mis : double dist = 2*Dist_norm_bord(face); - // Pierre dit que : - const double dist = Dist_norm_bord(face); - const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array(); - - for (int k = 0; k < ncomp; k++) - { - if (DERIVED_T::IS_QUASI) - { - const double T_imp = la_cl.val_imp(face - num1, k); - const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; - flux[k] = (i != -1) ? (T_imp - inco(i, k)) / dv_mvol(i) * surface(face) * porosite(face) * nu_1(i, ori) / dist : - (inco(j, k) - T_imp) / dv_mvol(j) * surface(face) * porosite(face) * nu_1(j, ori) / dist; - } - else if (DERIVED_T::IS_MULTI_SCALAR_DIFF) - { - flux[k] = 0.0; - for (int l = 0; l < ncomp; l++) - { - const double T_imp = la_cl.val_imp(face - num1, l); - const int ori = ncomp * k + l; - - flux[k] += (i != -1) ? (T_imp - inco(i, l)) * surface(face) * porosite(face) * nu_1(i, ori) / dist : - (inco(j, l) - T_imp) * surface(face) * porosite(face) * nu_1(j, ori) / dist; - } - } - else - { - const double T_imp = la_cl.val_imp(face - num1, k); - const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; - - flux[k] = (i != -1) ? 
(T_imp - inco(i, k)) * surface(face) * porosite(face) * nu_1(i, ori) / dist : - (inco(j, k) - T_imp) * surface(face) * porosite(face) * nu_1(j, ori) / dist; - } - } -} - -template template -inline void Eval_Diff_VDF_Elem_Gen::flux_face(const DoubleTab& inco, const DoubleTab& val_b, const int face, const Scalaire_impose_paroi& la_cl, const int num1, Type_Double& flux) const -{ - const double dist = Dist_norm_bord(face); - const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array(); - - for (int k = 0; k < ncomp; k++) - { - if (DERIVED_T::IS_MULTI_SCALAR_DIFF) - { - flux[k] = 0.0; - for (int l = 0; l < ncomp; l++) - { - const double T_imp = la_cl.val_imp(face-num1, l); - const int ori = ncomp * k + l; - flux[k] += (i != -1) ? (T_imp-inco(i,l))*surface(face)*porosite(face)*nu_1(i,ori)/dist : - (inco(j,l)-T_imp)*surface(face)*porosite(face)*nu_1(j,ori)/dist; - } - } - else - { - const double T_imp = la_cl.val_imp(face-num1,k); - const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; - flux[k] = (i != -1) ? (T_imp-inco(i,k))*surface(face)*porosite(face)*nu_1(i,ori)/dist : - (inco(j,k)-T_imp)*surface(face)*porosite(face)*nu_1(j,ori)/dist; - } - } -} - -template template -inline void Eval_Diff_VDF_Elem_Gen::flux_face(const DoubleTab& inco, const DoubleTab& val_b, const int face, const Dirichlet_loi_paroi& la_cl, const int num1, Type_Double& flux) const -{ - if (DERIVED_T::IS_MULTI_SCALAR_DIFF) throw; - - const double dist = Dist_norm_bord(face); - const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array(); - - for (int k = 0; k < ncomp; k++) - { - const double T_imp = la_cl.val_imp(face-num1,k); - const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; - flux[k] = (i != -1) ? 
(T_imp-inco(i,k))*surface(face)*porosite(face)*nu_1(i,ori)/dist : (inco(j,k)-T_imp)*surface(face)*porosite(face)*nu_1(j,ori)/dist; - } -} - -template template -inline void Eval_Diff_VDF_Elem_Gen::flux_face(const DoubleTab& , const DoubleTab& val_b, const int face, const Neumann_paroi& la_cl, const int num1, Type_Double& flux) const -{ - const int i = elem_(face,0), ncomp = flux.size_array(); - - // XXX LUIS : Note : Pas de distinguo entre MULTISCALAR_DIFF et une diffusion normale pour des CL de Neumann - for (int k = 0; k < ncomp; k++) - flux[k] = ((i != -1) ? 1 : -1) * la_cl.flux_impose(face - num1, k) * surface(face); -} - -template template -inline void Eval_Diff_VDF_Elem_Gen::flux_face(const DoubleTab& inco, const DoubleTab&, const int face, const Periodique& la_cl, const int , Type_Double& flux) const -{ - const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array(); - const double d0 = le_dom->dist_face_elem0_period(face,i,la_cl.distance()), d1 = le_dom->dist_face_elem1_period(face,j,la_cl.distance()); - - for (int k = 0; k < ncomp; k++) - { - if (DERIVED_T::IS_MULTI_SCALAR_DIFF) - { - - if (DERIVED_T::IS_ANISO) throw; // XXX LUIS : pas d'anisotropie pour l'instant - - flux[k] = 0.0; - for (int l = 0; l < ncomp; l++) - { - const int comp_diff = ncomp * k + l; - double heq = 0.; - - if (nu_1(i,comp_diff) == 0.0 || nu_1(j,comp_diff) == 0.0) heq = 0.; - else - { - assert(nu_1(i,comp_diff) != 0.0 && nu_1(j,comp_diff) != 0.0); - heq = compute_heq(d0, i, d1, j, comp_diff); - } - flux[k] += DERIVED_T::IS_QUASI ? heq*(inco(j,l)/dv_mvol(j) - inco(i,l)/dv_mvol(i))*surface(face)*porosite(face) : heq*(inco(j,l) - inco(i,l))*surface(face)*porosite(face); - } - - } - else - { - double heq = -123.; - const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; - if (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) heq = 0.; - else - { - assert(nu_1(i,ori) != 0.0 && nu_1(j,ori) != 0.0); - heq = compute_heq(d0,i, d1,j,ori); - } - flux[k] = DERIVED_T::IS_QUASI ? 
heq*(inco(j,k)/dv_mvol(j) - inco(i,k)/dv_mvol(i))*surface(face)*porosite(face) : heq*(inco(j,k) - inco(i,k))*surface(face)*porosite(face); - } - } -} - -template template -inline void Eval_Diff_VDF_Elem_Gen::flux_face(const DoubleTab& inco, const DoubleTab& val_b, const int face, const Dirichlet_paroi_fixe&, const int num1, Type_Double& flux ) const -{ - if (DERIVED_T::IS_MULTI_SCALAR_DIFF) throw; - - const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array(); - const double dist = Dist_norm_bord(face); - - if (DERIVED_T::IS_QUASI) - for (int k = 0; k < ncomp; k++) flux[k] = (i != -1) ? -inco(i,k)*surface(face)*porosite(face)*nu_1(i,k)/dv_mvol(i)/dist : inco(j,k)*surface(face)*porosite(face)*nu_1(j,k)/dv_mvol(j)/dist; - else - for (int k = 0; k < ncomp; k++) flux[k] = (i != -1) ? -inco(i,k)*surface(face)*porosite(face)*nu_1(i,k)/dist : inco(j,k)*surface(face)*porosite(face)*nu_1(j,k)/dist; -} - template template inline void Eval_Diff_VDF_Elem_Gen::flux_face(const DoubleTab& inco, const DoubleTab&, const int face , const Echange_global_impose& la_cl, const int num1, Type_Double& flux) const { @@ -346,51 +193,157 @@ inline void Eval_Diff_VDF_Elem_Gen::flux_face(const DoubleTab& inco, } } -template template -inline void Eval_Diff_VDF_Elem_Gen::flux_faces_interne(const DoubleTab& inco, const int face, Type_Double& flux) const +template template +KOKKOS_INLINE_FUNCTION void Eval_Diff_VDF_Elem_Gen::flux_faces_bord_comp(CDoubleTabView inco, CDoubleTabView val_b, const int face, const BC_View& bc_view, const int num1, const int k, double& flux) const { - const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array(); - double heq, d0 = Dist_face_elem0(face,i), d1 = Dist_face_elem1(face,j); - for (int k = 0; k < ncomp; k++) + if constexpr (std::is_same_v) { - const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; - if (DERIVED_T::IS_RANS) - { - heq = compute_heq(d0,i, d1,j,ori); // pas d'assert pour k-eps ! - flux[k] = DERIVED_T::IS_QUASI ? 
heq*(inco(j,k)/dv_mvol(j) - inco(i,k)/dv_mvol(i))*surface(face)*porosite(face) : - heq*(inco(j,k)-inco(i,k))*surface(face)*porosite(face); - } - else if (DERIVED_T::IS_MULTI_SCALAR_DIFF) + if (DERIVED_T::IS_MULTI_SCALAR_DIFF) { Process::Kokkos_exit("IS_MULTI_SCALAR_DIFF not supported in _comp"); return; } + const double distance = bc_view.val[0](0,0); + const int i = elem_v_(face,0), j = elem_v_(face,1); + const double d0 = le_dom_v_.dist_face_elem0_period(face,i,distance); + const double d1 = le_dom_v_.dist_face_elem1_period(face,j,distance); + const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k; + const double heq = compute_heq(d0,i,d1,j,ori); + flux = DERIVED_T::IS_QUASI ? heq*(inco(j,k)/dv_mvol(j) - inco(i,k)/dv_mvol(i))*surface_v_(face)*porosite_v_(face) + : heq*(inco(j,k) - inco(i,k))*surface_v_(face)*porosite_v_(face); + } + else if constexpr (std::is_same_v) + { + const double dist = get_dist_norm_bord(face); + const int i = elem_v_(face,0), j = elem_v_(face,1); + if (DERIVED_T::IS_MULTI_SCALAR_DIFF) { - flux[k] = 0.0; + const int ncomp = (int)inco.extent(1); + flux = 0.; for (int l = 0; l < ncomp; l++) { - const int comp_diff = ncomp * k + l; - if (nu_1(i,comp_diff) == 0.0 || nu_1(j,comp_diff) == 0.0) heq = 0.; - else - { - assert(nu_1(i,comp_diff) != 0.0 && nu_1(j,comp_diff) != 0.0); - heq = compute_heq(d0, i, d1, j, comp_diff); - } - flux[k] += heq * (inco(j, l) - inco(i, l)) * surface(face) * porosite(face); + const double T_imp = bc_view.val[0](face - num1, l); + const int ori = ncomp * k + l; + flux += (i != -1) ? (T_imp - inco(i,l)) * surface_v_(face) * porosite_v_(face) * nu_1(i,ori) / dist + : (inco(j,l) - T_imp) * surface_v_(face) * porosite_v_(face) * nu_1(j,ori) / dist; } } + else if (DERIVED_T::IS_QUASI) + { + const double T_imp = bc_view.val[0](face-num1,k); + const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k; + flux = (i != -1) ? 
(T_imp-inco(i,k))/dv_mvol(i)*surface_v_(face)*porosite_v_(face)*nu_1(i,ori)/dist + : (inco(j,k)-T_imp)/dv_mvol(j)*surface_v_(face)*porosite_v_(face)*nu_1(j,ori)/dist; + } else { - if (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) heq = 0.; - else + const double T_imp = bc_view.val[0](face-num1,k); + const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k; + flux = (i != -1) ? (T_imp-inco(i,k))*surface_v_(face)*porosite_v_(face)*nu_1(i,ori)/dist + : (inco(j,k)-T_imp)*surface_v_(face)*porosite_v_(face)*nu_1(j,ori)/dist; + } + } + else if constexpr (std::is_same_v) + { + const double dist = get_dist_norm_bord(face); + const int i = elem_v_(face,0), j = elem_v_(face,1); + if (DERIVED_T::IS_MULTI_SCALAR_DIFF) + { + const int ncomp = (int)inco.extent(1); + flux = 0.; + for (int l = 0; l < ncomp; l++) { - assert(nu_1(i,ori) != 0.0 && nu_1(j,ori) != 0.0); - heq = compute_heq(d0,i, d1,j,ori); + const double T_imp = bc_view.val[0](face - num1, l); + const int ori = ncomp * k + l; + flux += (i != -1) ? (T_imp - inco(i,l)) * surface_v_(face) * porosite_v_(face) * nu_1(i,ori) / dist + : (inco(j,l) - T_imp) * surface_v_(face) * porosite_v_(face) * nu_1(j,ori) / dist; } - flux[k] = heq*(inco(j,k)-inco(i,k))*surface(face)*porosite(face); + } + else + { + const double T_imp = bc_view.val[0](face-num1,k); + const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k; + flux = (i != -1) ? (T_imp-inco(i,k))*surface_v_(face)*porosite_v_(face)*nu_1(i,ori)/dist + : (inco(j,k)-T_imp)*surface_v_(face)*porosite_v_(face)*nu_1(j,ori)/dist; } } + else if constexpr (std::is_same_v) + { + if (DERIVED_T::IS_MULTI_SCALAR_DIFF) { Process::Kokkos_exit("IS_MULTI_SCALAR_DIFF not supported in _comp"); return; } + const double dist = get_dist_norm_bord(face); + const int i = elem_v_(face,0), j = elem_v_(face,1); + const double T_imp = bc_view.val[0](face-num1,k); + const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k; + flux = (i != -1) ? 
(T_imp-inco(i,k))*surface_v_(face)*porosite_v_(face)*nu_1(i,ori)/dist + : (inco(j,k)-T_imp)*surface_v_(face)*porosite_v_(face)*nu_1(j,ori)/dist; + } + else if constexpr (std::is_same_v) + { + // No distinction between IS_MULTI_SCALAR_DIFF and normal diffusion for Neumann BCs + const int i = elem_v_(face,0); + flux = ((i != -1) ? 1 : -1) * bc_view.val[0](face-num1,k)*surface_v_(face); + } + else if constexpr (std::is_same_v) + { + if (DERIVED_T::IS_MULTI_SCALAR_DIFF) { Process::Kokkos_exit("IS_MULTI_SCALAR_DIFF not supported in _comp"); return; } + const int i = elem_v_(face,0), j = elem_v_(face,1); + const double dist = get_dist_norm_bord(face); + if (DERIVED_T::IS_QUASI) + flux = (i != -1) ? -inco(i,k)*surface_v_(face)*porosite_v_(face)*nu_1(i,k)/dv_mvol(i)/dist + : inco(j,k)*surface_v_(face)*porosite_v_(face)*nu_1(j,k)/dv_mvol(j)/dist; + else + flux = (i != -1) ? -inco(i,k)*surface_v_(face)*porosite_v_(face)*nu_1(i,k)/dist + : inco(j,k)*surface_v_(face)*porosite_v_(face)*nu_1(j,k)/dist; + } + else if constexpr(std::is_same_v) + { + if (DERIVED_T::IS_MULTI_SCALAR_DIFF) + { + Process::Kokkos_exit("IS_MULTI_SCALAR_DIFF not supported for Echange_global_impose"); + return; + } + const int i = elem_v_(face, 0), j = elem_v_(face, 1); + const double h = bc_view.val[0](face - num1, k); + const double Text = bc_view.val[1](face - num1, k); + const double phi = bc_view.val[2].data() ? bc_view.val[2](face - num1, k) : 0.0; + flux = (i != -1) ? (phi + h * (Text - inco(i, k))) * surface_v_(face) + : (-phi + h * (inco(j, k) - Text)) * surface_v_(face); + } + else + flux = 0.; // Do nothing for Neumann_sortie_libre, Symetrie, etc. +} + +template +KOKKOS_INLINE_FUNCTION void Eval_Diff_VDF_Elem_Gen::flux_faces_interne_comp(CDoubleTabView inco, const int face, const int k, double& flux) const +{ + const int i = elem_v_(face,0), j = elem_v_(face,1); + const double d0 = get_dist_face_elem0(face,i), d1 = get_dist_face_elem1(face,j); + const int ori = DERIVED_T::IS_ANISO ? 
orientation_v_(face) : k; + if (DERIVED_T::IS_RANS) + { + const double heq = compute_heq(d0,i,d1,j,ori); + flux = DERIVED_T::IS_QUASI ? heq*(inco(j,k)/dv_mvol(j) - inco(i,k)/dv_mvol(i))*surface_v_(face)*porosite_v_(face) + : heq*(inco(j,k)-inco(i,k))*surface_v_(face)*porosite_v_(face); + } + else if (DERIVED_T::IS_MULTI_SCALAR_DIFF) + { + const int ncomp = (int)inco.extent(1); + flux = 0.; + for (int l = 0; l < ncomp; l++) + { + const int comp_diff = ncomp * k + l; + const double heq = (nu_1(i,comp_diff) == 0.0 || nu_1(j,comp_diff) == 0.0) ? 0. : compute_heq(d0, i, d1, j, comp_diff); + flux += heq * (inco(j, l) - inco(i, l)) * surface_v_(face) * porosite_v_(face); + } + } + else + { + double heq; + if (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) heq = 0.; + else heq = compute_heq(d0,i,d1,j,ori); + flux = heq*(inco(j,k)-inco(i,k))*surface_v_(face)*porosite_v_(face); + } } /* ************************************** * - * ********* POUR L'IMPLICITE ********** * - * ************************************** */ +* ********* POUR L'IMPLICITE ********** * +* ************************************** */ template template inline void Eval_Diff_VDF_Elem_Gen::coeffs_face(const int face, const int, const Dirichlet_entree_fluide& la_cl, Type_Double& aii, Type_Double& ajj) const @@ -399,7 +352,7 @@ inline void Eval_Diff_VDF_Elem_Gen::coeffs_face(const int face, const const int i = elem_(face,0), j = elem_(face,1), ncomp = DERIVED_T::IS_MULTI_SCALAR_DIFF ? static_cast(std::sqrt(aii.size_array())) : aii.size_array(); - const double dist = Dist_norm_bord(face); + const double dist = get_dist_norm_bord(face); for (int k = 0; k < ncomp; k++) if (DERIVED_T::IS_MULTI_SCALAR_DIFF) @@ -426,7 +379,7 @@ inline void Eval_Diff_VDF_Elem_Gen::coeffs_face(const int face, const const int i = elem_(face,0), j = elem_(face,1), ncomp = DERIVED_T::IS_MULTI_SCALAR_DIFF ? 
static_cast(std::sqrt(aii.size_array())) : aii.size_array(); - const double dist = Dist_norm_bord(face); + const double dist = get_dist_norm_bord(face); for (int k = 0; k < ncomp; k++) if (DERIVED_T::IS_MULTI_SCALAR_DIFF) @@ -451,7 +404,7 @@ inline void Eval_Diff_VDF_Elem_Gen::coeffs_face(const int face, const { assert (aii.size_array() == ajj.size_array()); const int i = elem_(face,0), j = elem_(face,1), ncomp = aii.size_array(); - const double dist = Dist_norm_bord(face); + const double dist = get_dist_norm_bord(face); for (int k = 0; k < ncomp; k++) { const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; @@ -473,13 +426,7 @@ inline void Eval_Diff_VDF_Elem_Gen::coeffs_face(const int face, const for (int l = 0; l < ncomp; l++) { const int ori = ncomp * k + l; - double heq; - if (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) heq = 0.; - else - { - assert(nu_1(i,ori) != 0.0 && nu_1(j,ori) != 0.0); - heq = compute_heq(d0, i, d1, j, ori); - } + double heq = (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) ? 0. : compute_heq(d0, i, d1, j, ori); aii[ori] = heq * surface(face) * porosite(face); ajj[ori] = heq * surface(face) * porosite(face); } @@ -487,13 +434,7 @@ inline void Eval_Diff_VDF_Elem_Gen::coeffs_face(const int face, const else { const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; - double heq; - if (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) heq = 0.; - else - { - assert(nu_1(i,ori) != 0.0 && nu_1(j,ori) != 0.0); - heq = compute_heq(d0, i, d1, j, ori); - } + double heq = (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) ? 0. : compute_heq(d0, i, d1, j, ori); aii[k] = ajj[k] = heq*surface(face)*porosite(face); // On peut faire ca ! 
} } @@ -504,7 +445,7 @@ inline void Eval_Diff_VDF_Elem_Gen::coeffs_face(const int face, const { assert (aii.size_array() == ajj.size_array()); const int i = elem_(face,0), j = elem_(face,1), ncomp = aii.size_array(); - const double dist = Dist_norm_bord(face); + const double dist = get_dist_norm_bord(face); if (DERIVED_T::IS_QUASI) { for (int k = 0; k < ncomp; k++) @@ -601,7 +542,7 @@ inline void Eval_Diff_VDF_Elem_Gen::coeffs_faces_interne(const int fa { assert (aii.size_array() == ajj.size_array()); const int i = elem_(face,0), j = elem_(face,1), ncomp = DERIVED_T::IS_MULTI_SCALAR_DIFF ? int(sqrt(aii.size_array())) : aii.size_array(); - double heq, d0 = Dist_face_elem0(face,i), d1 = Dist_face_elem1(face,j); + double heq, d0 = get_dist_face_elem0(face,i), d1 = get_dist_face_elem1(face,j); for (int k = 0; k < ncomp; k++) { const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; @@ -644,7 +585,7 @@ template template inline void Eval_Diff_VDF_Elem_Gen::secmem_face(const int face, const Dirichlet_entree_fluide& la_cl, const int num1, Type_Double& flux) const { const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array(); - double dist = Dist_norm_bord(face); + double dist = get_dist_norm_bord(face); for (int k = 0; k < ncomp; k++) { const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k; diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.h index b783b48e70..d989ebc60d 100644 --- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.h +++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -42,37 +42,47 @@ class Eval_Diff_VDF_Face_Gen : public Eval_VDF_Face, public Evaluateur_VDF * ********* POUR L'EXPLICITE ********** * * ************************************** */ - template inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void> - flux_fa7(const DoubleTab&, const DoubleTab*, int , const Neumann_sortie_libre&, int, Type_Double& ) const { /* Do nothing */ } + template + inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void> + flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ; - template inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void> - flux_fa7(const DoubleTab&, const DoubleTab*, int, int, int, Type_Double& ) const; + // _comp variants: void with output ref(s) for one component k + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const; - template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void> - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const; - template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void> - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const; - template inline std::enable_if_t - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, 
CDoubleTabView, int, int, int, int, int, double&) const; - template inline std::enable_if_t<(Arete_Type == Type_Flux_Arete::NAVIER), void> - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, int, double&, double&) const; - template - inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void> - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double&) const; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&, double&) const; - template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PERIODICITE, void> - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double&) const ; + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, double& f3, double& f12) const + { Process::Kokkos_exit("arete_coin_fluide not coded for this scheme."); f3 = f12 = 0.; } - template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void> - flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double&) const - { - Cerr << "arete_coin_fluide not coded for this scheme. Ask the TRUST support to code like Eval_Amont_VDF_Face !" 
<< finl; - Process::exit(); - } + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_fa7_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, double&) const; + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t + flux_fa7_comp(CDoubleTabView, CDoubleTabView, int, CDoubleTabView, int, int, double& f) const { f = 0.; } /* ************************************** * * ********* POUR L'IMPLICITE ********** * @@ -94,35 +104,51 @@ class Eval_Diff_VDF_Face_Gen : public Eval_VDF_Face, public Evaluateur_VDF coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& ) const; template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void> - coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const; + coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const; template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void> - coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const; + coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const; template inline std::enable_if_t< (Arete_Type == Type_Flux_Arete::NAVIER), void> - coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const; + coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const; template inline std::enable_if_t< (Arete_Type == Type_Flux_Arete::COIN_FLUIDE), void> - coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const { /* Do nothing */ } + coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, 
int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const { /* Do nothing */ } + + + void view_ro() const override + { + Evaluateur_VDF::view_ro(); + static_cast(this)->view_ro_impl(); + } private: inline double surface_(int i,int j) const { return 0.5*(surface(i)+surface(j)); } inline double porosity_(int i,int j) const { return 0.5*(porosite(i)+porosite(j)); } + KOKKOS_INLINE_FUNCTION double mean_surface(int i,int j) const { return 0.5*(surface_v_(i)+surface_v_(j)); } + KOKKOS_INLINE_FUNCTION double mean_porosity(int i,int j) const { return 0.5*(porosite_v_(i)+porosite_v_(j)); } // CRTP pattern to static_cast the appropriate class and get the implementation : This is magic ! - inline double nu_mean_2pts(int i=0, int j=0, int compo=0) const { return static_cast(this)->nu_1_impl_face(i, j, compo); } - inline double nu_mean_4pts(int i, int j, int k, int l, int compo=0) const { return static_cast(this)->nu_2_impl_face(i, j, k, l,compo); } - inline double nu_lam(int i, int j=0) const { return static_cast(this)->nu_2_impl(i,j); } // Attention nu_2_impl and not nu_1_impl for Dift ... 
- inline double nu_lam_mean_4pts(int i, int j, int k, int l, int compo=0) const { return static_cast(this)->nu_lam_impl_face(i,j,k,l,compo); } - inline double nu_lam_mean_2pts(int i, int j, int compo=0) const { return static_cast(this)->nu_lam_impl_face2(i,j,compo); } - inline double nu_turb(int i, int compo=0) const { return static_cast(this)->nu_t_impl(i,compo); } - inline double tau_tan(int i, int j) const { return static_cast(this)->tau_tan_impl(i,j); } - inline bool uses_wall_law() const { return static_cast(this)->uses_wall(); } - inline bool uses_mod_turb() const { return static_cast(this)->uses_mod(); } + template + KOKKOS_INLINE_FUNCTION double nu_mean_2pts(int i=0, int j=0, int compo=0) const { return static_cast(this)->template nu_1_impl_face(i, j, compo); } + template + KOKKOS_INLINE_FUNCTION double nu_mean_4pts(int i, int j, int k, int l, int compo=0) const { return static_cast(this)->template nu_2_impl_face(i, j, k, l,compo); } + template + KOKKOS_INLINE_FUNCTION double nu_lam(int i, int j=0) const { return static_cast(this)->template nu_2_impl(i,j); } // Attention nu_2_impl and not nu_1_impl for Dift ... 
+ template + KOKKOS_INLINE_FUNCTION double nu_lam_mean_4pts(int i, int j, int k, int l, int compo=0) const { return static_cast(this)->template nu_lam_impl_face(i,j,k,l,compo); } + template + KOKKOS_INLINE_FUNCTION double nu_lam_mean_2pts(int i, int j, int compo=0) const { return static_cast(this)->template nu_lam_impl_face2(i,j,compo); } + template + KOKKOS_INLINE_FUNCTION double nu_turb(int i, int compo=0) const { return static_cast(this)->template nu_t_impl(i,compo); } + KOKKOS_INLINE_FUNCTION double tau_tan(int i, int j) const { return static_cast(this)->tau_tan_impl(i,j); } + KOKKOS_INLINE_FUNCTION bool uses_wall_law() const { return static_cast(this)->uses_wall(); } + KOKKOS_INLINE_FUNCTION bool uses_mod_turb() const { return static_cast(this)->uses_mod(); } inline const DoubleTab& k_elem() const { return static_cast(this)->get_k_elem(); } // pour F5 seulement ... // methods to check coeffs/flux implementation + /* static constexpr double EPS = 1e-6; template void check_error(const char * , const int, const int , const Type_Double& , const Type_Double& , const Type_Double& ) const; @@ -133,11 +159,11 @@ class Eval_Diff_VDF_Face_Gen : public Eval_VDF_Face, public Evaluateur_VDF template inline std::enable_if_t - test_coeffs_common(const int , const int , const int , const int , Type_Double& , Type_Double& ) const; + test_coeffs_common(const DoubleTab&, const DoubleTab&, const int , const int , const int , const int , Type_Double& , Type_Double& ) const; template inline std::enable_if_t - test_coeffs_common(const int , const int , const int , const int , Type_Double& , Type_Double& , Type_Double& , Type_Double&) const; + test_coeffs_common(const DoubleTab&, const DoubleTab&, const int , const int , const int , const int , Type_Double& , Type_Double& , Type_Double& , Type_Double&) const; template inline std::enable_if_t @@ -160,11 +186,12 @@ class Eval_Diff_VDF_Face_Gen : public Eval_VDF_Face, public Evaluateur_VDF template inline std::enable_if_t - 
test_coeffs_arete(const int, const int, const int, const int, const Type_Double&) const; + test_coeffs_arete(const DoubleTab&, const DoubleTab&, const int, const int, const int, const int, const Type_Double&) const; template inline std::enable_if_t - test_coeffs_arete(const int, const int, const int, const int, const Type_Double& , const Type_Double&) const; + test_coeffs_arete(const DoubleTab&, const DoubleTab&, const int, const int, const int, const int, const Type_Double& , const Type_Double&) const; + */ }; #include // templates specializations ici ;) diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.tpp b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.tpp index 984555ffa4..26a9886586 100644 --- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.tpp +++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.tpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -25,20 +25,8 @@ * ********* POUR L'EXPLICITE ********** * * ************************************** */ -template template inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void> -Eval_Diff_VDF_Face_Gen::flux_fa7(const DoubleTab& inco, const DoubleTab*, int elem, int fac1, int fac2, Type_Double& flux) const -{ - const int ori = orientation(fac1), ncomp = flux.size_array(); - const double dist = dist_face(fac1,fac2,ori), surf = 0.5*(surface(fac1)*porosite(fac1)+surface(fac2)*porosite(fac2)); - for (int k = 0; k < ncomp; k++) - { - const double tau = (inco(fac2,k)-inco(fac1,k))/dist, tau_tr = ACTIVATE_TAU_TR ? tau : 0.0; - const double visc_lam = nu_lam(elem, k), visc_turb = DERIVED_T::IS_TURB ? 
nu_turb(elem, k) : 0.; - flux[k] = ((tau + tau_tr) * (visc_lam + visc_turb)) * surf; - } -} - -template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void> +template template +inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void> Eval_Diff_VDF_Face_Gen::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int fac4, Type_Double& flux) const { const int ori1 = orientation(fac1), ori3 = orientation(fac3), elem1 = elem_(fac3,0), elem2 = elem_(fac3,1), elem3 = elem_(fac4,0), elem4 = elem_(fac4,1), ncomp = flux.size_array(); @@ -53,156 +41,129 @@ Eval_Diff_VDF_Face_Gen::flux_arete(const DoubleTab& inco, const Doubl } } -template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void> -Eval_Diff_VDF_Face_Gen::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int fac4, Type_Double& flux) const +// ===== _comp scalar variants (one component k) for MDRangePolicy kernels ===== + +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Diff_VDF_Face_Gen::flux_fa7_comp(CDoubleTabView inco, CDoubleTabView, int elem, int fac1, int fac2, int k, double& flux) const { - const int N = flux.size_array(); - int elem[4], ori1 = orientation(fac1), ori3 = orientation(fac3); - elem[0] = elem_(fac3,0), elem[1] = elem_(fac3,1), elem[2] = elem_(fac4,0), elem[3] = elem_(fac4,1); - std::vector visc_lam_temp(N), visc_turb_temp(N); - for (int k = 0; k < N; k++) - for (int i = 0; i < 4; i++) - if (elem[i] != -1) - { - visc_lam_temp[k] += nu_lam(elem[i], k); - visc_turb_temp[k] += nu_turb(elem[i], k); - } - for (int k = 0; k < N; k++) - { - visc_lam_temp[k] /= 3.0; - visc_turb_temp[k] /= 3.0; - } - const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2); + const int ori = orientation_v_(fac1); + const double dist = le_dom_v_.dist_face(fac1, fac2, ori); + const double surf = 0.5 * (surface_v_(fac1) * porosite_v_(fac1) + surface_v_(fac2) * 
porosite_v_(fac2)); + const double tau = (inco(fac2, k) - inco(fac1, k)) / dist; + const double tau_tr = ACTIVATE_TAU_TR ? tau : 0.0; + const double visc_lam = nu_lam(elem, k); + const double visc_turb = DERIVED_T::IS_TURB ? nu_turb(elem, k) : 0.; + flux = ((tau + tau_tr) * (visc_lam + visc_turb)) * surf; +} - for (int k = 0; k < N; k++) - if (inco(fac4,k)*inco(fac3,k) != 0) - { - const double visc_lam = visc_lam_temp[k], visc_turb = DERIVED_T::IS_TURB ? visc_turb_temp[k] : 0.0; - const double tau = (inco(fac4,k)-inco(fac3,k))/dist_face(fac3,fac4,ori1), tau_tr = ACTIVATE_TAU_TR ? (inco(fac2,k)-inco(fac1,k))/dist_face(fac1,fac2,ori3) : 0.0; - flux[k] = ((tau + tau_tr) * (visc_lam + visc_turb)) * surf * poros; - } +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Diff_VDF_Face_Gen::flux_arete_comp(CDoubleTabView inco, CDoubleTabView, int fac1, int fac2, int fac3, int fac4, int k, double& flux) const +{ + const int ori1 = orientation_v_(fac1), ori3 = orientation_v_(fac3); + const int elem1 = elem_v_(fac3,0), elem2 = elem_v_(fac3,1), elem3 = elem_v_(fac4,0), elem4 = elem_v_(fac4,1); + const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2); + const double tau = (inco(fac4,k)-inco(fac3,k))/le_dom_v_.dist_face(fac3,fac4,ori1); + const double tau_tr = ACTIVATE_TAU_TR ? (inco(fac2,k)-inco(fac1,k))/le_dom_v_.dist_face(fac1,fac2,ori3) : 0.0; + const int ind = DERIVED_T::IS_ANISO ? ori3 : k; + const double visc_lam = nu_lam_mean_4pts(elem1,elem2,elem3,elem4,ind); + const double visc_turb = DERIVED_T::IS_TURB ? 
nu_mean_4pts(elem1,elem2,elem3,elem4,ind) : 0.0; + flux = ((tau+tau_tr)*(visc_lam+visc_turb))*surf*poros; } -template template -inline std::enable_if_t -Eval_Diff_VDF_Face_Gen::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& flux) const +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Diff_VDF_Face_Gen::flux_arete_comp(CDoubleTabView inco, CDoubleTabView, int fac1, int fac2, int fac3, int fac4, int k, double& flux) const +{ + const int ori1 = orientation_v_(fac1), ori3 = orientation_v_(fac3); + const int elems[4] = {elem_v_(fac3,0), elem_v_(fac3,1), elem_v_(fac4,0), elem_v_(fac4,1)}; + const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2); + if (inco(fac4,k)*inco(fac3,k) == 0) { flux = 0.; return; } + double visc_lam_temp = 0, visc_turb_temp = 0; + for (int i = 0; i < 4; i++) + if (elems[i] != -1) { visc_lam_temp += nu_lam(elems[i],k); visc_turb_temp += nu_turb(elems[i],k); } + visc_lam_temp /= 3.0; + visc_turb_temp /= 3.0; + const double visc_lam = visc_lam_temp, visc_turb = DERIVED_T::IS_TURB ? visc_turb_temp : 0.0; + const double tau = (inco(fac4,k)-inco(fac3,k))/le_dom_v_.dist_face(fac3,fac4,ori1); + const double tau_tr = ACTIVATE_TAU_TR ? 
(inco(fac2,k)-inco(fac1,k))/le_dom_v_.dist_face(fac1,fac2,ori3) : 0.0; + flux = ((tau+tau_tr)*(visc_lam+visc_turb))*surf*poros; +} + +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Diff_VDF_Face_Gen::flux_arete_comp(CDoubleTabView inco, CDoubleTabView val_imp_face_bord, CDoubleTabView coeff_frottement_face_bord, CDoubleTabView, int fac1, int fac2, int fac3, int signe, int k, double& flux) const { constexpr bool is_PAROI = (Arete_Type == Type_Flux_Arete::PAROI); - const int rang1 = (fac1-premiere_face_bord), rang2 = (fac2-premiere_face_bord), ori = orientation(fac3), ncomp = flux.size_array(); - if ( !uses_wall_law() ) + const int ori = orientation_v_(fac3); + if (!uses_wall_law()) { - int elem1 = elem_(fac3,0), elem2 = elem_(fac3,1); - if (is_PAROI) - { - if (elem1 == -1) elem1 = elem2; - else if (elem2 == -1) elem2 = elem1; - } - - const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2), dist = dist_norm_bord(fac1), tps = inconnue->temps(); - - const double vit_imp = is_PAROI ? 0.5*(Champ_Face_get_val_imp_face_bord(tps,rang1,ori,la_zcl.valeur())+Champ_Face_get_val_imp_face_bord(tps,rang2,ori,la_zcl.valeur())) : - 0.5*(Champ_Face_get_val_imp_face_bord_sym(inco,tps,rang1,ori,la_zcl.valeur())+Champ_Face_get_val_imp_face_bord_sym(inco,tps,rang2,ori,la_zcl.valeur())); - + int elem1 = elem_v_(fac3,0), elem2 = elem_v_(fac3,1); + if (is_PAROI) { if (elem1==-1) elem1=elem2; else if (elem2==-1) elem2=elem1; } + const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2), dist = le_dom_v_.dist_norm_bord(fac1); + const double vit_imp = 0.5*(val_imp_face_bord(fac1,ori)+val_imp_face_bord(fac2,ori)); double coeff = 0.0; - for (int k = 0; k < ncomp; k++) - { - if (!is_PAROI) // NAVIER_PAROI - coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur())); - - const int ind = DERIVED_T::IS_ANISO ? 
ori : k; - const double visc_lam = nu_lam_mean_2pts(elem1,elem2,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts(elem1,elem2,ind) : 0.0; - const double tau = (signe*(vit_imp-inco(fac3,k))/dist) - (signe * coeff * inco(fac3, k)), tau_tr = 0.; - flux[k] = ((tau + tau_tr) * (visc_lam + visc_turb)) * surf * poros; - } + if (!is_PAROI) coeff = 0.5*(coeff_frottement_face_bord(fac1,k)+coeff_frottement_face_bord(fac2,k)); + const int ind = DERIVED_T::IS_ANISO ? ori : k; + const double visc_lam = nu_lam_mean_2pts(elem1,elem2,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts(elem1,elem2,ind) : 0.0; + const double tau = (signe*(vit_imp-inco(fac3,k))/dist) - (signe*coeff*inco(fac3,k)); + flux = (tau*(visc_lam+visc_turb))*surf*poros; } else - { - double tau1 = tau_tan(rang1,ori)*0.5*surface(fac1), tau2 = tau_tan(rang2,ori)*0.5*surface(fac2); - for (int k = 0; k < ncomp; k++) flux[k] = tau1 + tau2; - } + flux = tau_tan(fac1,ori)*0.5*surface_v_(fac1) + tau_tan(fac2,ori)*0.5*surface_v_(fac2); } -template template -inline std::enable_if_t< (Arete_Type == Type_Flux_Arete::NAVIER), void> -Eval_Diff_VDF_Face_Gen::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& flux) const +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Diff_VDF_Face_Gen::flux_arete_comp(CDoubleTabView inco, CDoubleTabView, CDoubleTabView coeff_frottement_face_bord, CDoubleTabView, int fac1, int fac2, int fac3, int signe, int k, double& flux) const { - // | - // fac 3 | fac 2 - // -------- - // | fac 1 - // | - - // fac3 est la face interne et fac1 et fac2 sont au bord Navier - // XXX : WARNING : nu/nu_turb deja dans coeff - const int ncomp = flux.size_array(); - const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2); - for (int k = 0; k < ncomp; k++) - { - const double coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur())); - const 
double tau = - signe * coeff * inco(fac3, k), tau_tr = 0.; - flux[k] = (tau + tau_tr) * surf * poros; - } + const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2); + const double coeff = 0.5*(coeff_frottement_face_bord(fac1,k)+coeff_frottement_face_bord(fac2,k)); + flux = (-signe*coeff*inco(fac3,k))*surf*poros; } -template template -inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void> -Eval_Diff_VDF_Face_Gen::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& flux3, Type_Double& flux1_2) const +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Diff_VDF_Face_Gen::flux_arete_comp(CDoubleTabView inco, CDoubleTabView val_imp_face_bord, CDoubleTabView coeff_frottement_face_bord, CDoubleTabView, int fac1, int fac2, int fac3, int signe, int /*ncomp*/, int k, double& flux3, double& flux1_2) const { - assert (flux3.size_array() == flux1_2.size_array()); constexpr bool is_NAV_FL = (Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE), is_PAR_FL = (Arete_Type == Type_Flux_Arete::PAROI_FLUIDE); - const int rang1 = (fac1-premiere_face_bord), rang2 = (fac2-premiere_face_bord), elem1 = elem_(fac3,0), elem2 = elem_(fac3,1), ori= orientation(fac3), ncomp = flux3.size_array(); - const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2), surfporos = surface(fac3)*porosite(fac3), tps = inconnue->temps(), - dist1 = dist_norm_bord(fac1), dist2 = dist_face(fac1,fac2,ori); - + const int elem1 = elem_v_(fac3,0), elem2 = elem_v_(fac3,1), ori = orientation_v_(fac3); + const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2); + const double surfporos = surface_v_(fac3)*porosite_v_(fac3); + const double dist1 = le_dom_v_.dist_norm_bord(fac1), dist2 = le_dom_v_.dist_face(fac1,fac2,ori); + const int ind = DERIVED_T::IS_ANISO ? 
ori : k; + const double visc_lam = nu_lam_mean_2pts(elem1,elem2,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts(elem1,elem2,ind) : 0.0; double vit_imp, coeff = 0.0; - - for (int k = 0; k < ncomp; k++) - { - const int ind = DERIVED_T::IS_ANISO ? ori : k; - const double visc_lam = nu_lam_mean_2pts(elem1,elem2,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts(elem1,elem2,ind) : 0.0; - if (is_NAV_FL) - { - vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord_sym(inco,tps,rang1,ori,la_zcl.valeur())+ Champ_Face_get_val_imp_face_bord_sym(inco,tps,rang2,ori,la_zcl.valeur())); - coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur())); - } - else if (is_PAR_FL) // On ne sait pas qui de fac1 ou de fac2 est la face de paroi - { - if (est_egal(inco(fac1,k),0)) vit_imp = Champ_Face_get_val_imp_face_bord(tps,rang2,ori,la_zcl.valeur()); // fac1 est la face de paroi - else vit_imp = Champ_Face_get_val_imp_face_bord(tps,rang1,ori,la_zcl.valeur()); // fac2 est la face de paroi - } - else vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord(tps,rang1,ori,la_zcl.valeur())+Champ_Face_get_val_imp_face_bord(tps,rang2,ori,la_zcl.valeur())); - - // | - // fac 3 | fac 2 - // -------- - // | fac 1 - // | - - // fac3 est la face interne et fac1 et fac2 sont au bord - const double tau_3 = (signe*(vit_imp-inco(fac3,k))/dist1) -(signe * coeff * inco(fac3, k)), - tau_12 = (inco(fac2,k)-inco(fac1,k))/dist2, tau_tr_3 = ACTIVATE_TAU_TR ? tau_12 : 0.0, tau_tr_12 = ACTIVATE_TAU_TR ? tau_3 : 0.0; - - flux3[k] = ((tau_3 + tau_tr_3) * (visc_lam + visc_turb)) * surf * poros; - flux1_2[k] = ((tau_12 + tau_tr_12) * (visc_lam + visc_turb)) * surfporos; - } + if (is_NAV_FL) + { vit_imp = 0.5*(val_imp_face_bord(fac1,ori)+val_imp_face_bord(fac2,ori)); coeff = 0.5*(coeff_frottement_face_bord(fac1,k)+coeff_frottement_face_bord(fac2,k)); } + else if (is_PAR_FL) + vit_imp = est_egal(inco(fac1,k),0,1e-12) ? 
val_imp_face_bord(fac2,ori) : val_imp_face_bord(fac1,ori); + else + vit_imp = 0.5*(val_imp_face_bord(fac1,ori)+val_imp_face_bord(fac2,ori)); + const double tau_3 = (signe*(vit_imp-inco(fac3,k))/dist1)-(signe*coeff*inco(fac3,k)); + const double tau_12 = (inco(fac2,k)-inco(fac1,k))/dist2; + const double tau_tr_3 = ACTIVATE_TAU_TR ? tau_12 : 0.0, tau_tr_12 = ACTIVATE_TAU_TR ? tau_3 : 0.0; + flux3 = ((tau_3 +tau_tr_3 )*(visc_lam+visc_turb))*surf*poros; + flux1_2 = ((tau_12 +tau_tr_12)*(visc_lam+visc_turb))*surfporos; } -template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PERIODICITE, void> -Eval_Diff_VDF_Face_Gen::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int fac4, Type_Double& flux3_4, Type_Double& flux1_2) const +template template +KOKKOS_INLINE_FUNCTION std::enable_if_t +Eval_Diff_VDF_Face_Gen::flux_arete_comp(CDoubleTabView inco, CDoubleTabView, int fac1, int fac2, int fac3, int fac4, int k, double& flux3_4, double& flux1_2) const { - assert (flux3_4.size_array() == flux1_2.size_array()); - const int ori1 = orientation(fac1), ori3 = orientation(fac3), elem1 = elem_(fac3,0), elem2 = elem_(fac3,1), elem3 = elem_(fac4,0), elem4 = elem_(fac4,1), ncomp = flux3_4.size_array(); - const double dist3_4 = dist_face_period(fac3,fac4,ori1), dist1_2 = dist_face_period(fac1,fac2,ori3), - surf1_2 = surface_(fac1,fac2), poros1_2 = porosity_(fac1, fac2), surf3_4 = surface_(fac3,fac4), poros3_4 = porosity_(fac3, fac4); - - for (int k = 0; k < ncomp; k++) - { - const int ind = DERIVED_T::IS_ANISO ? ori3 : k; - const double visc_lam = nu_lam_mean_4pts(elem1,elem2,elem3,elem4,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_4pts(elem1,elem2,elem3,elem4,ind) : 0.0; - const double tau_34 = (inco(fac4,k)-inco(fac3,k))/dist3_4, tau_12 = (inco(fac2,k)-inco(fac1,k))/dist1_2, tau_tr_34 = ACTIVATE_TAU_TR ? tau_12 : 0.0, tau_tr_12 = ACTIVATE_TAU_TR ? 
tau_34 : 0.0; - flux3_4[k] = ((tau_34 + tau_tr_34) * (visc_lam + visc_turb)) * surf1_2 * poros1_2; - flux1_2[k] = ((tau_12 + tau_tr_12) * (visc_lam + visc_turb)) * surf3_4 * poros3_4; - } + const int ori1 = orientation_v_(fac1), ori3 = orientation_v_(fac3); + const int elem1 = elem_v_(fac3,0), elem2 = elem_v_(fac3,1), elem3 = elem_v_(fac4,0), elem4 = elem_v_(fac4,1); + const double dist3_4 = le_dom_v_.dist_face_period(fac3,fac4,ori1), dist1_2 = le_dom_v_.dist_face_period(fac1,fac2,ori3); + const double surf1_2 = mean_surface(fac1,fac2), poros1_2 = mean_porosity(fac1,fac2); + const double surf3_4 = mean_surface(fac3,fac4), poros3_4 = mean_porosity(fac3,fac4); + const int ind = DERIVED_T::IS_ANISO ? ori3 : k; + const double visc_lam = nu_lam_mean_4pts(elem1,elem2,elem3,elem4,ind); + const double visc_turb = DERIVED_T::IS_TURB ? nu_mean_4pts(elem1,elem2,elem3,elem4,ind) : 0.0; + const double tau_34 = (inco(fac4,k)-inco(fac3,k))/dist3_4, tau_12 = (inco(fac2,k)-inco(fac1,k))/dist1_2; + const double tau_tr_34 = ACTIVATE_TAU_TR ? tau_12 : 0.0, tau_tr_12 = ACTIVATE_TAU_TR ? 
tau_34 : 0.0; + flux3_4 = ((tau_34+tau_tr_34)*(visc_lam+visc_turb))*surf1_2*poros1_2; + flux1_2 = ((tau_12+tau_tr_12)*(visc_lam+visc_turb))*surf3_4*poros3_4; } /* ************************************** * @@ -223,7 +184,7 @@ Eval_Diff_VDF_Face_Gen::coeffs_fa7(const DoubleTab*, int elem,int fac f1[k] = f2[k] = ((d_tau + d_tau_tr) * (visc_lam + visc_turb)) * surf; } - if (TEST_COEFFS) test_coeffs_fa7(elem,fac1,fac2,f1); + //if (TEST_COEFFS) test_coeffs_fa7(elem,fac1,fac2,f1); } template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void> @@ -241,7 +202,7 @@ Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*, int fac1, int aii[k] = ajj[k] = ((d_tau + d_tau_tr) * (visc_lam + visc_turb)) * surf * poros; } - if (TEST_COEFFS) test_coeffs_arete(fac1,fac2,fac3,fac4,aii); + //if (TEST_COEFFS) test_coeffs_arete(fac1,fac2,fac3,fac4,aii); } template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void> @@ -269,7 +230,7 @@ Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*,int fac1, int f const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2), d_tau = 1. 
/ dist_face(fac3,fac4,ori1), d_tau_tr = 0.; // On derive par rapport a fac3 et fac4 - const DoubleTab& inco = inconnue->valeurs(); + const DoubleTab& inco = inconnue_->valeurs(); for (int k = 0; k < ncomp; k++) if (inco(fac4,k) * inco(fac3,k) != 0) { @@ -277,12 +238,12 @@ Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*,int fac1, int f aii[k] = ajj[k] = ((d_tau + d_tau_tr) * (visc_lam + visc_turb)) * surf * poros; } - if (TEST_COEFFS) test_coeffs_arete(fac1,fac2,fac3,fac4,aii); + //if (TEST_COEFFS) test_coeffs_arete(fac1,fac2,fac3,fac4,aii); } template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void> -Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3_4, Type_Double& ajj1_2) const +Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3_4, Type_Double& ajj1_2) const { assert(aii1_2.size_array() == aii3_4.size_array() && aii1_2.size_array() == ajj1_2.size_array()); constexpr bool is_PAROI = (Arete_Type == Type_Flux_Arete::PAROI); @@ -302,7 +263,7 @@ Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*, int fac1, int for (int k = 0; k < ncomp; k++) { if (!is_PAROI) // NAVIER_PAROI - coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur())); + coeff = 0.5 * (coeff_frottement_face_bord(fac1, k) + coeff_frottement_face_bord(fac2, k)); const double d_tau = signe / dist - (signe * coeff), d_tau_tr = 0.; // On a pas derive ... deja nul dans le flux ! const int ind = DERIVED_T::IS_ANISO ? 
ori : k; @@ -313,12 +274,12 @@ Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*, int fac1, int } else for (int k = 0; k < ncomp; k++) aii3_4[k] = aii1_2[k] = ajj1_2[k] = 0.; - if (TEST_COEFFS) test_coeffs_arete(fac1,fac2,fac3,signe,aii3_4); + //if (TEST_COEFFS) test_coeffs_arete(val_imp_face_bord, coeff_frottement_face_bord, fac1,fac2,fac3,signe,aii3_4); } template template inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void> -Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3, Type_Double& ajj1_2) const +Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3, Type_Double& ajj1_2) const { assert(aii1_2.size_array() == aii3.size_array() && aii1_2.size_array() == ajj1_2.size_array()); constexpr bool is_NAV_FL = (Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE); @@ -330,7 +291,7 @@ Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*, int fac1, int for (int k = 0; k < ncomp; k++) { if (is_NAV_FL) - coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur())); + coeff = 0.5 * (coeff_frottement_face_bord(fac1, k) + coeff_frottement_face_bord(fac2, k)); const double d_tau_3 = (signe / dist1) - (signe * coeff), d_tau_tr_3 = 0., // On derive par rapport a fac3 d_tau_12 = 1. 
/ dist2, d_tau_tr_12 = 0.; // On derive par rapport a fac1 et fac2 @@ -341,12 +302,12 @@ Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*, int fac1, int aii1_2[k] = ajj1_2[k] = ((d_tau_12 + d_tau_tr_12) * (visc_lam + visc_turb)) * surfporos; } - if (TEST_COEFFS) test_coeffs_arete(fac1,fac2,fac3,signe,aii1_2,aii3); + //if (TEST_COEFFS) test_coeffs_arete(val_imp_face_bord, coeff_frottement_face_bord, fac1,fac2,fac3,signe,aii1_2,aii3); } template template inline std::enable_if_t< (Arete_Type == Type_Flux_Arete::NAVIER), void> -Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3, Type_Double& ajj1_2) const +Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3, Type_Double& ajj1_2) const { assert(aii1_2.size_array() == aii3.size_array() && aii1_2.size_array() == ajj1_2.size_array()); const int elem1 = elem_(fac3,0), elem2 = elem_(fac3,1), ncomp = aii1_2.size_array(), ori = orientation(fac3); @@ -356,7 +317,7 @@ Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*, int fac1, int { const int ind = DERIVED_T::IS_ANISO ? ori : k; const double visc_lam = nu_lam_mean_2pts(elem1, elem2, ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts(elem1, elem2, ind) : 0.0; - const double coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur())); + const double coeff = 0.5 * (coeff_frottement_face_bord(fac1, k) + coeff_frottement_face_bord(fac2, k)); const double d_tau_3 = - (signe * coeff), d_tau_tr_3 = 0., // On derive par rapport a fac3 d_tau_12 = 1. 
/ dist2, d_tau_tr_12 = 0.; // On derive par rapport a fac1 et fac2 @@ -380,13 +341,14 @@ Eval_Diff_VDF_Face_Gen::coeffs_arete(const DoubleTab*,int fac1, int f aii[k] = ajj[k] = ((d_tau + d_tau_tr) * (visc_lam + visc_turb)) * surf * poros; } - if (TEST_COEFFS) test_coeffs_arete(fac1,fac2,fac3,fac4,aii); + //if (TEST_COEFFS) test_coeffs_arete(fac1,fac2,fac3,fac4,aii); } /* ************************************** * * ********* For checking *********** * * ************************************** */ +/* template template void Eval_Diff_VDF_Face_Gen::check_error(const char * nom_funct, const int Type_Flux, const int ncomp, const Type_Double& f1, const Type_Double& flux_p, const Type_Double& flux_m) const { @@ -411,7 +373,7 @@ template template Eval_Diff_VDF_Face_Gen::test_coeffs_common(const int fac1, const int fac2, const int fac3, const int fac4, Type_Double& flux_p, Type_Double& flux_m) const { - DoubleTab inco_pert = inconnue->valeurs(); + DoubleTab inco_pert = inconnue_->valeurs(); const int ncomp = flux_p.size_array(); for (int k = 0; k < ncomp; k++) inco_pert(fac4,k) += EPS; // XXX : ATTENTION SIGNE @@ -423,49 +385,49 @@ Eval_Diff_VDF_Face_Gen::test_coeffs_common(const int fac1, const int template template inline std::enable_if_t -Eval_Diff_VDF_Face_Gen::test_coeffs_common(const int fac1, const int fac2, const int fac3, const int signe, Type_Double& flux_p, Type_Double& flux_m) const +Eval_Diff_VDF_Face_Gen::test_coeffs_common(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const int fac1, const int fac2, const int fac3, const int signe, Type_Double& flux_p, Type_Double& flux_m) const { - DoubleTab inco_pert = inconnue->valeurs(); + DoubleTab inco_pert = inconnue_->valeurs(); const int ncomp = flux_p.size_array(); for (int k = 0; k < ncomp; k++) inco_pert(fac3,k) -= EPS; // XXX : ATTENTION SIGNE - flux_arete(inco_pert,nullptr,fac1,fac2,fac3,signe,flux_p); + flux_arete(inco_pert,val_imp_face_bord, 
coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,flux_p); for (int k = 0; k < ncomp; k++) inco_pert(fac3,k) += 2.0 * EPS; // XXX : ATTENTION SIGNE - flux_arete(inco_pert,nullptr,fac1,fac2,fac3,signe,flux_m); + flux_arete(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,flux_m); } template template inline std::enable_if_t -Eval_Diff_VDF_Face_Gen::test_coeffs_common(const int fac1, const int fac2, const int fac3, const int signe, Type_Double& flux_p3, Type_Double& flux_m3, Type_Double& flux_p12, Type_Double& flux_m12) const +Eval_Diff_VDF_Face_Gen::test_coeffs_common(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const int fac1, const int fac2, const int fac3, const int signe, Type_Double& flux_p3, Type_Double& flux_m3, Type_Double& flux_p12, Type_Double& flux_m12) const { - DoubleTab inco_pert = inconnue->valeurs(); + DoubleTab inco_pert = inconnue_->valeurs(); Type_Double poubelle(flux_p3.size_array()); const int ncomp = flux_p3.size_array(); // pour flux3 for (int k = 0; k < ncomp; k++) inco_pert(fac3,k) -= EPS; // XXX : ATTENTION SIGNE - flux_arete(inco_pert,nullptr,fac1,fac2,fac3,signe,flux_p3,poubelle); + flux_arete(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,flux_p3,poubelle); for (int k = 0; k < ncomp; k++) inco_pert(fac3,k) += 2.0 * EPS; // XXX : ATTENTION SIGNE - flux_arete(inco_pert,nullptr,fac1,fac2,fac3,signe,flux_m3,poubelle); + flux_arete(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,flux_m3,poubelle); // pour flux1_2 - inco_pert = inconnue->valeurs(); // back to real values + inco_pert = inconnue_->valeurs(); // back to real values for (int k = 0; k < ncomp; k++) inco_pert(fac2,k) += EPS; // XXX : ATTENTION SIGNE - flux_arete(inco_pert,nullptr,fac1,fac2,fac3,signe,poubelle,flux_p12); + flux_arete(inco_pert,val_imp_face_bord, 
coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,poubelle,flux_p12); for (int k = 0; k < ncomp; k++) inco_pert(fac2,k) -= 2.0 * EPS; // XXX : ATTENTION SIGNE - flux_arete(inco_pert,nullptr,fac1,fac2,fac3,signe,poubelle,flux_m12); + flux_arete(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,poubelle,flux_m12); } template template inline std::enable_if_t Eval_Diff_VDF_Face_Gen::test_coeffs_common(const int fac1, const int fac2, const int fac3, const int fac4, Type_Double& flux_p, Type_Double& flux_m) const { - DoubleTab inco_pert = inconnue->valeurs(); + DoubleTab inco_pert = inconnue_->valeurs(); Type_Double poubelle(flux_p.size_array()); const int ncomp = flux_p.size_array(); @@ -481,7 +443,7 @@ Eval_Diff_VDF_Face_Gen::test_coeffs_fa7(const int elem, const int fac { const int ncomp = f1.size_array(); Type_Double flux_p(ncomp), flux_m(ncomp); - DoubleTab inco_pert = inconnue->valeurs(); + DoubleTab inco_pert = inconnue_->valeurs(); for (int k = 0; k < ncomp; k++) inco_pert(fac2,k) += EPS; // XXX : ATTENTION SIGNE flux_fa7(inco_pert,nullptr,elem,fac1,fac2,flux_p); @@ -509,26 +471,26 @@ Eval_Diff_VDF_Face_Gen::test_coeffs_arete(const int fac1, const int f const int ncomp = aii.size_array(); Type_Double flux_p(ncomp), flux_m(ncomp); test_coeffs_common(fac1,fac2,fac3,fac4,flux_p,flux_m); - if (inconnue->valeurs()(fac4,0) * inconnue->valeurs()(fac3,0) != 0) check_error(__func__,(int)Arete_Type,ncomp,aii,flux_p,flux_m); + if (inconnue_->valeurs()(fac4,0) * inconnue_->valeurs()(fac3,0) != 0) check_error(__func__,(int)Arete_Type,ncomp,aii,flux_p,flux_m); } template template inline std::enable_if_t -Eval_Diff_VDF_Face_Gen::test_coeffs_arete(const int fac1, const int fac2, const int fac3, const int signe, const Type_Double& aii3_4) const +Eval_Diff_VDF_Face_Gen::test_coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const int fac1, const int fac2, const int fac3, const int signe, const 
Type_Double& aii3_4) const { const int ncomp = aii3_4.size_array(); Type_Double flux_p(ncomp), flux_m(ncomp); - test_coeffs_common(fac1,fac2,fac3,signe,flux_p,flux_m); + test_coeffs_common(val_imp_face_bord, coeff_frottement_face_bord,fac1,fac2,fac3,signe,flux_p,flux_m); if ( !uses_wall_law() ) check_error(__func__,(int)Arete_Type,ncomp,aii3_4,flux_p,flux_m); } template template inline std::enable_if_t -Eval_Diff_VDF_Face_Gen::test_coeffs_arete(const int fac1, const int fac2, const int fac3, const int signe, const Type_Double& aii1_2, const Type_Double& aii3) const +Eval_Diff_VDF_Face_Gen::test_coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const int fac1, const int fac2, const int fac3, const int signe, const Type_Double& aii1_2, const Type_Double& aii3) const { const int ncomp = aii1_2.size_array(); Type_Double flux_p3(ncomp), flux_m3(ncomp), flux_p12(ncomp), flux_m12(ncomp); - test_coeffs_common(fac1,fac2,fac3,signe,flux_p3,flux_m3,flux_p12,flux_m12); + test_coeffs_common(val_imp_face_bord, coeff_frottement_face_bord,fac1,fac2,fac3,signe,flux_p3,flux_m3,flux_p12,flux_m12); check_error(__func__,(int)Arete_Type,ncomp,aii3,flux_p3,flux_m3); check_error(__func__,(int)Arete_Type,ncomp,aii1_2,flux_p12,flux_m12); } @@ -542,5 +504,5 @@ Eval_Diff_VDF_Face_Gen::test_coeffs_arete(const int fac1, const int f test_coeffs_common(fac1,fac2,fac3,fac4,flux_p,flux_m); check_error(__func__,(int)Arete_Type,ncomp,aii,flux_p,flux_m); } - +*/ #endif /* Eval_Diff_VDF_Face_Gen_TPP_included */ diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_Multiphase_VDF.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_Multiphase_VDF.h index 0adb2a09b7..a1a39cefbe 100644 --- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_Multiphase_VDF.h +++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_Multiphase_VDF.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -37,16 +37,16 @@ class Eval_Dift_Multiphase_VDF : public Eval_Dift_VDF { const DoubleTab& rho = ref_probleme_->milieu().masse_volumique().passe(); const int cR = rho.dimension(0) == 1; - tab_diffusivite_turbulente = nu_t_; + tab_diffusivite_turbulente_ = nu_t_; if (need_alpha_rho_ && sub_type(Pb_Multiphase, ref_probleme_.valeur())) { for (int e = 0; e < nu_t_->dimension(0); e++) for (int n = 0; n < nu_t_->dimension(1); n++) - tab_diffusivite_turbulente(e, n) = tab_alpha_(e, n) * rho(!cR * e, n) * nu_t_.valeur()(e, n); + tab_diffusivite_turbulente_(e, n) = tab_alpha_(e, n) * rho(!cR * e, n) * nu_t_.valeur()(e, n); } - tab_diffusivite_turbulente.echange_espace_virtuel(); + tab_diffusivite_turbulente_.echange_espace_virtuel(); tab_diff_turb_first_update_ = false; } @@ -55,7 +55,7 @@ class Eval_Dift_Multiphase_VDF : public Eval_Dift_VDF if (tab_diff_turb_first_update_) const_cast(*this).update_diffusivite_turbulente(); - return tab_diffusivite_turbulente; + return tab_diffusivite_turbulente_; } const Champ_Fonc_base& diffusivite_turbulente() const { throw; } diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF.h index 1fbf26e1c8..471ddcb672 100644 --- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF.h +++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF.h @@ -24,6 +24,18 @@ class Eval_Dift_VDF : public Eval_Diff_VDF { public: + inline Eval_Dift_VDF() { } + inline Eval_Dift_VDF(const Eval_Dift_VDF& eval) : Eval_Diff_VDF(eval) + { + is_multi_ = eval.is_multi_; + loipar = eval.loipar; + ref_diffusivite_turbulente_ = eval.ref_diffusivite_turbulente_; + //equivalent_distance.ref(eval.equivalent_distance); + tab_diffusivite_turbulente_.ref(eval.tab_diffusivite_turbulente_); + tab_diffusivite_turbulente_v_ = eval.tab_diffusivite_turbulente_v_; + } 
+ virtual ~Eval_Dift_VDF() { } + inline void mettre_a_jour() override { Eval_Diff_VDF::mettre_a_jour(); @@ -35,46 +47,60 @@ class Eval_Dift_VDF : public Eval_Diff_VDF return equivalent_distance[boundary_index](local_face); } - inline double compute_heq_impl(double d0, int i, double d1, int j, int compo) const + template + KOKKOS_INLINE_FUNCTION double tab_diffusivite_turbulente(int face, int comp) const { if constexpr (std::is_same::value) return tab_diffusivite_turbulente_(face, comp); else return tab_diffusivite_turbulente_v_(face, comp); } + + template + KOKKOS_INLINE_FUNCTION double compute_heq_impl(double d0, int i, double d1, int j, int compo) const { - const double heq_lam = Eval_Diff_VDF::compute_heq_impl(d0, i, d1, j, compo); - const double heq_turb = 0.5 * (tab_diffusivite_turbulente(i, is_multi_ * compo) + tab_diffusivite_turbulente(j, is_multi_ * compo)) / (d1 + d0); + const double heq_lam = Eval_Diff_VDF::compute_heq_impl(d0, i, d1, j, compo); + const double heq_turb = 0.5 * (tab_diffusivite_turbulente(i, is_multi_ * compo) + tab_diffusivite_turbulente(j, is_multi_ * compo)) / (d1 + d0); return heq_lam + heq_turb; } - inline double nu_t_impl(int i, int compo) const { return tab_diffusivite_turbulente(i, is_multi_ * compo); } + template + KOKKOS_INLINE_FUNCTION double nu_t_impl(int i, int compo) const + { + return tab_diffusivite_turbulente(i, is_multi_ * compo); + } - inline double nu_lam_impl_face(int i, int j, int k, int l, int compo) const + template + KOKKOS_INLINE_FUNCTION double nu_lam_impl_face(int i, int j, int k, int l, int compo) const { - return Eval_Diff_VDF::nu_2_impl_face(i, j, k, l, compo); + return Eval_Diff_VDF::nu_2_impl_face(i, j, k, l, compo); } - inline double nu_lam_impl_face2(int i, int j, int compo) const + template + KOKKOS_INLINE_FUNCTION double nu_lam_impl_face2(int i, int j, int compo) const { - return Eval_Diff_VDF::nu_1_impl_face(i, j, compo); + return Eval_Diff_VDF::nu_1_impl_face(i, j, compo); } - inline double 
nu_1_impl(int i, int compo) const + template + KOKKOS_INLINE_FUNCTION double nu_1_impl(int i, int compo) const { - const double nu_lam = Eval_Diff_VDF::nu_1_impl(i, compo); - const double nu_turb = tab_diffusivite_turbulente(i, is_multi_ * compo); + const double nu_lam = Eval_Diff_VDF::nu_1_impl(i, compo); + const double nu_turb = tab_diffusivite_turbulente(i, is_multi_ * compo); return nu_lam + nu_turb; } - inline double nu_2_impl(int i, int compo) const + template + KOKKOS_INLINE_FUNCTION double nu_2_impl(int i, int compo) const { - return Eval_Diff_VDF::nu_2_impl(i, compo); + return Eval_Diff_VDF::nu_2_impl(i, compo); } - inline double nu_1_impl_face(int i, int j, int compo) const + template + KOKKOS_INLINE_FUNCTION double nu_1_impl_face(int i, int j, int compo) const { - return 0.5 * (tab_diffusivite_turbulente(i, is_multi_ * compo) + tab_diffusivite_turbulente(j, is_multi_ * compo)); + return 0.5 * (tab_diffusivite_turbulente(i, is_multi_ * compo) + tab_diffusivite_turbulente(j, is_multi_ * compo)); } - inline double nu_2_impl_face(int i, int j, int k, int l, int compo) const + template + KOKKOS_INLINE_FUNCTION double nu_2_impl_face(int i, int j, int k, int l, int compo) const { - return 0.25 * (tab_diffusivite_turbulente(i, is_multi_ * compo) + tab_diffusivite_turbulente(j, is_multi_ * compo) + - tab_diffusivite_turbulente(k, is_multi_ * compo) + tab_diffusivite_turbulente(l, is_multi_ * compo)); + return 0.25 * (tab_diffusivite_turbulente(i, is_multi_ * compo) + tab_diffusivite_turbulente(j, is_multi_ * compo) + + tab_diffusivite_turbulente(k, is_multi_ * compo) + tab_diffusivite_turbulente(l, is_multi_ * compo)); } void update_equivalent_distance() @@ -92,19 +118,26 @@ class Eval_Dift_VDF : public Eval_Diff_VDF inline void associer_diff_turb(const Champ_Fonc_base& diff_turb) { ref_diffusivite_turbulente_ = diff_turb; - tab_diffusivite_turbulente.ref(diff_turb.valeurs()); + tab_diffusivite_turbulente_.ref(diff_turb.valeurs()); is_multi_ = 
(diff_turb.valeurs().dimension(1) > 1) ? 1 : 0; } inline virtual void associer_loipar(const Turbulence_paroi_scal_base& loi_paroi) { loipar = loi_paroi; } inline virtual void init_ind_fluctu_term() { /* do nothing */} + void view_ro_impl() const override + { + Eval_Diff_VDF::view_ro_impl(); + tab_diffusivite_turbulente_v_ = tab_diffusivite_turbulente_.view_ro(); + } + protected: int is_multi_ = 0; OBS_PTR(Champ_Fonc_base) ref_diffusivite_turbulente_; OBS_PTR(Turbulence_paroi_scal_base) loipar; DoubleVects equivalent_distance; - DoubleTab tab_diffusivite_turbulente; + DoubleTab tab_diffusivite_turbulente_; + mutable CDoubleTabView tab_diffusivite_turbulente_v_; }; #endif /* Eval_Dift_VDF_included */ diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.cpp b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.cpp index 71c23b1f91..9bb72ed8d9 100644 --- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.cpp +++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.cpp @@ -37,19 +37,3 @@ void Eval_Dift_VDF_Face::mettre_a_jour() if (tab.size_array() > 0) tau_tan_.ref(tab); } } - -double Eval_Dift_VDF_Face::tau_tan_impl(int face, int k) const -{ - const int nb_faces = le_dom->nb_faces(); - const ArrOfInt& ind_faces_virt_bord = le_dom->ind_faces_virt_bord(); - int f = (face >= tau_tan_.dimension(0)) ? 
ind_faces_virt_bord[face-nb_faces] : face; - if(f >= tau_tan_.dimension_tot(0)) - { - Cerr << "Erreur dans tau_tan " << finl; - Cerr << "dimension : " << tau_tan_.dimension(0) << finl; - Cerr << "dimension_tot : " << tau_tan_.dimension_tot(0) << finl; - Cerr << "face : " << face << finl; - Process::exit(); - } - return tau_tan_(f,k); -} diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.h index 009925b28e..d064dd5699 100644 --- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.h +++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -92,23 +92,45 @@ class Eval_Dift_VDF_Multi_inco_Elem : public Eval_Diff_VDF_Elem_Gen, public Eval_Dift_VDF { public: + inline Eval_Dift_VDF_Face() { } + inline Eval_Dift_VDF_Face(const Eval_Dift_VDF_Face& eval) : Eval_Diff_VDF_Face_Gen(eval), Eval_Dift_VDF(eval) + { + le_modele_turbulence = eval.le_modele_turbulence; + loipar = eval.loipar; + tau_tan_.ref(eval.tau_tan_); + tau_tan_v_ = eval.tau_tan_v_; + } + virtual ~Eval_Dift_VDF_Face() { } static constexpr bool IS_TURB = true, CALC_FA7_SORTIE_LIB = true, CALC_ARR_PAR_FL = false; inline void associer_modele_turbulence(const Modele_turbulence_hyd_base& mod) { le_modele_turbulence = mod; } - inline bool uses_wall() const { return le_modele_turbulence->utiliser_loi_paroi(); } + KOKKOS_INLINE_FUNCTION bool uses_wall() const { return tau_tan_v_.data(); } void mettre_a_jour() override; - double tau_tan_impl(int face,int k) const; + void view_ro() const override + { + Eval_Diff_VDF_Face_Gen::view_ro(); + if (le_modele_turbulence->utiliser_loi_paroi()) + tau_tan_v_ = tau_tan_.view_ro(); + 
} + KOKKOS_INLINE_FUNCTION double tau_tan_impl(int face,int k) const + { + int size = (int)tau_tan_v_.extent(0); + int f = (face >= size) ? le_dom_v_.ind_faces_virt_bord(face-le_dom_v_.nb_faces()) : face; + assert(f, public Eval_Dift_Multiphase_VDF { public: static constexpr bool IS_TURB = true, CALC_FA7_SORTIE_LIB = true, CALC_ARR_PAR_FL = false; - inline bool uses_wall() const { return false; } + KOKKOS_INLINE_FUNCTION bool uses_wall() const { return false; } }; #endif /* Eval_Dift_VDF_leaves_included */ diff --git a/src/VDF/Operateurs/Eval_Divers/CL_Types_include.h b/src/VDF/Operateurs/Eval_Divers/CL_Types_include.h index 9594003e5d..b184a5ca43 100644 --- a/src/VDF/Operateurs/Eval_Divers/CL_Types_include.h +++ b/src/VDF/Operateurs/Eval_Divers/CL_Types_include.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -29,5 +29,13 @@ #include #include #include +#include + +/*! @brief BC_view struct to pass views on array to define Boundary conditions values (val_imp, t_ext, h_imp, ...) + */ +struct BC_View +{ + CDoubleTabView val[3]; +}; #endif /* CL_Types_include_included */ diff --git a/src/VDF/Operateurs/Eval_Divers/Eval_Div_VDF_Elem.h b/src/VDF/Operateurs/Eval_Divers/Eval_Div_VDF_Elem.h index e7f55e8a79..df0ad6122d 100644 --- a/src/VDF/Operateurs/Eval_Divers/Eval_Div_VDF_Elem.h +++ b/src/VDF/Operateurs/Eval_Divers/Eval_Div_VDF_Elem.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -33,25 +33,26 @@ class Eval_Div_VDF_Elem : public Eval_Div_VDF, public Eval_VDF_Elem * ********* POUR L'EXPLICITE ********** * * ************************************** */ - template // Generic return - inline void flux_face(const DoubleTab& inco, const DoubleTab&, const int face, const BC&, const int , Type_Double& flux) const - { for (int k=0; k inline void flux_face(const DoubleTab&, const DoubleTab&, const int , const Symetrie&, const int, Type_Double& ) const { /* Do nothing */ } - template inline void flux_face(const DoubleTab&, const DoubleTab&, const int , const Dirichlet_paroi_fixe&, const int, Type_Double& ) const { /* Do nothing */ } - template inline void flux_face(const DoubleTab&, const DoubleTab&, const int , const Dirichlet_paroi_defilante&, const int, Type_Double& ) const { /* Do nothing */ } - template - inline void flux_face(const DoubleTab& inco, const int boundary_index, const int face, const int local_face, const Echange_externe_impose&, const int, Type_Double& flux) const + inline void flux_face(const DoubleTab& inco, const DoubleTab&, const int face, const Echange_global_impose&, const int , Type_Double& flux) const { for (int k=0; k - inline void flux_faces_interne(const DoubleTab& inco, const int face, Type_Double& flux) const + template inline void flux_face(const DoubleTab& inco, const int boundary_index, const int face, const int local_face, const Echange_externe_impose&, const int, Type_Double& flux) const { for (int k=0; k + KOKKOS_INLINE_FUNCTION void flux_faces_bord_comp(CDoubleTabView inco, CDoubleTabView, const int face, const BC_View&, const int, const int k, double& flux) const + { + if constexpr (std::is_same_v || std::is_same_v || std::is_same_v) + flux = 0.; + else + flux = inco(face,k)*surface_v_(face)*porosite_v_(face); + } + /* ************************************** * - * 
********* POUR L'IMPLICITE ********** * - * ************************************** */ + * ********* POUR L'IMPLICITE ********** * + * ************************************** */ template inline void coeffs_face(const int, const int, const BC&, Type_Double& , Type_Double& ) const { /* Do nothing */ } @@ -63,6 +64,7 @@ class Eval_Div_VDF_Elem : public Eval_Div_VDF, public Eval_VDF_Elem template inline void secmem_face(const int, const BC&, const int, Type_Double& ) const { throw; } template inline void secmem_face(const int, const int, const int, const Echange_externe_impose&, const int, Type_Double& ) const { throw; } template inline void secmem_faces_interne(const int, Type_Double& ) const { throw; } + }; #endif /* Eval_Div_VDF_Elem_included */ diff --git a/src/VDF/Operateurs/Eval_Divers/Eval_VDF_Face.h b/src/VDF/Operateurs/Eval_Divers/Eval_VDF_Face.h index 98b9251293..a45c8599e0 100644 --- a/src/VDF/Operateurs/Eval_Divers/Eval_VDF_Face.h +++ b/src/VDF/Operateurs/Eval_Divers/Eval_VDF_Face.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -35,15 +35,15 @@ class Eval_VDF_Face CALC_ARR_INT = true, CALC_ARR_MIXTE = true, CALC_ARR_PERIO = true, CALC_ARR_PAR_FL = true, CALC_ARR_PAR = true, CALC_ARR_NAVIER_PAR = true, CALC_ARR_NAVIER_FL = true, CALC_ARR_NAVIER = true; inline void associer_inconnue(const Champ_base& ); - + inline OBS_PTR(Champ_base) inconnue() const { return inconnue_; } protected: - OBS_PTR(Champ_base) inconnue; + OBS_PTR(Champ_base) inconnue_; }; inline void Eval_VDF_Face::associer_inconnue(const Champ_base& inco) { assert(sub_type(Champ_Face_VDF,inco)); - inconnue=ref_cast(Champ_Face_VDF,inco); + inconnue_=ref_cast(Champ_Face_VDF,inco); } #endif /* Eval_VDF_Face_included */ diff --git a/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.cpp b/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.cpp index f5145b128c..361c3ecb2b 100644 --- a/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.cpp +++ b/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -14,41 +14,43 @@ *****************************************************************************/ #include -#include #include -#include -Evaluateur_VDF::Evaluateur_VDF(const Evaluateur_VDF& eval) : le_dom(eval.le_dom), la_zcl(eval.la_zcl), dimension(eval.dimension), - premiere_face_bord(eval.premiere_face_bord) +Evaluateur_VDF::Evaluateur_VDF(const Evaluateur_VDF& eval) { - surface.ref(eval.surface); - orientation.ref(eval.orientation); - elem_.ref(eval.elem_); + le_dom = eval.le_dom; + la_zcl = eval.la_zcl; porosite.ref(eval.porosite); - volume_entrelaces.ref(eval.volume_entrelaces); - xv.ref(eval.xv); + le_dom_v_ = eval.le_dom_v_; + elem_v_ = eval.elem_v_; + surface_v_ = eval.surface_v_; + orientation_v_ = eval.orientation_v_; + porosite_v_ = eval.porosite_v_; + volume_entrelaces_v_ = eval.volume_entrelaces_v_; + xv_v_ = eval.xv_v_; +} + +void Evaluateur_VDF::view_ro() const +{ + le_dom_v_.set(le_dom); + elem_v_ = le_dom->face_voisins().view_ro(); + surface_v_ = le_dom->face_surfaces().view_ro(); + orientation_v_ = le_dom->orientation().view_ro(); + porosite_v_ = porosite.view_ro(); + volume_entrelaces_v_ = le_dom->volumes_entrelaces().view_ro(); + xv_v_ = le_dom->xv().view_ro(); } void Evaluateur_VDF::associer_domaines(const Domaine_VDF& domaine_vdf, const Domaine_Cl_VDF& domaine_cl_vdf) { le_dom = domaine_vdf; la_zcl = domaine_cl_vdf; - dimension = Objet_U::dimension; - premiere_face_bord = domaine_vdf.premiere_face_bord(); - surface.ref(domaine_vdf.face_surfaces()); - orientation.ref(domaine_vdf.orientation()); - elem_.ref(domaine_vdf.face_voisins()); - porosite.ref(la_zcl->equation().milieu().porosite_face()); - volume_entrelaces.ref(domaine_vdf.volumes_entrelaces()); - xv.ref(domaine_vdf.xv()); + associer_porosite(la_zcl->equation().milieu().porosite_face()); } +// Peut etre appelee par F5: void 
Evaluateur_VDF::associer_porosite(const DoubleVect& poro) { porosite.ref(poro); } -double Evaluateur_VDF::dist_norm_bord(int face) const -{ - return le_dom->dist_norm_bord(face); -} diff --git a/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.h b/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.h index e1d2751bac..7c4d618785 100644 --- a/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.h +++ b/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.h @@ -19,8 +19,9 @@ #include #include #include - -class Domaine_Cl_VDF; +#include +#include +#include /*! @brief class Evaluateur_VDF Classe de base des evaluateurs VDF. * @@ -38,25 +39,32 @@ class Evaluateur_VDF Evaluateur_VDF(const Evaluateur_VDF& ); virtual void associer_domaines(const Domaine_VDF& , const Domaine_Cl_VDF& ); virtual void associer_porosite(const DoubleVect&); - - inline double dist_face_period(int fac1, int fac2, int k) const { return le_dom->dist_face_period(fac1,fac2,k); } - inline double dist_face(int fac1, int fac2, int k) const - { - return xv(fac2,k) - xv(fac1,k); - //return le_dom->dist_face(fac1, fac2, k); - } + virtual void view_ro() const; + inline const Domaine_Cl_VDF& get_la_zcl() const { return la_zcl.valeur(); } protected: OBS_PTR(Domaine_VDF) le_dom; OBS_PTR(Domaine_Cl_VDF) la_zcl; - int dimension = -100, premiere_face_bord = -100; - IntTab elem_; // les 2 elements voisins d'une face - DoubleVect surface; // surfaces des faces - IntVect orientation; // orientations des faces DoubleVect porosite; // porosites surfaciques - DoubleVect volume_entrelaces;// - DoubleTab xv; // coord des centres des faces - double dist_norm_bord(int) const; + + // Vues utilisees par les parties Kokkos: + Domaine_VDF_View le_dom_v_; // Struct pour acceder aux donnees sur le device du Domaine_VDF + mutable CIntTabView elem_v_; // les 2 elements voisins d'une face + mutable CDoubleArrView surface_v_; // surfaces des faces + mutable CIntArrView orientation_v_; // orientations des faces + mutable CDoubleArrView porosite_v_; // 
porosites surfaciques + mutable CDoubleArrView volume_entrelaces_v_; // volumes entrelacees + mutable CDoubleTabView xv_v_; // coord des centres des faces + + // Fonctions appelees par les parties encore non portees: + inline int elem_(int face, int k) const { return le_dom->face_voisins(face, k); } + inline int orientation(int face) const { return le_dom->orientation(face); } + inline double xv(int face, int k) const { return le_dom->xv(face, k); } + inline double dist_norm_bord(int face) const { return le_dom->dist_norm_bord(face); } + inline double surface(int face) const { return le_dom->face_surfaces()(face); } + inline double volume_entrelaces(int face) const { return le_dom->volumes_entrelaces()(face); } + inline double dist_face_period(int fac1, int fac2, int k) const { return le_dom->dist_face_period(fac1,fac2,k); } + inline double dist_face(int fac1, int fac2, int k) const { return xv(fac2,k) - xv(fac1,k); } }; #endif /* Evaluateur_VDF_included */ diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.h b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.h index c7ac3bff73..22cf52f8ac 100644 --- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.h +++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -33,13 +33,13 @@ class Iterateur_VDF_Elem : public Iterateur_VDF_base public: Iterateur_VDF_Elem() { } - Iterateur_VDF_Elem(const Iterateur_VDF_Elem<_TYPE_>& iter) : Iterateur_VDF_base(iter), flux_evaluateur(iter.flux_evaluateur) { elem.ref(iter.elem); } + Iterateur_VDF_Elem(const Iterateur_VDF_Elem<_TYPE_>& iter) : Iterateur_VDF_base(iter), flux_evaluateur_(iter.flux_evaluateur_) { elem_.ref(iter.elem_); } - inline Evaluateur_VDF& evaluateur() override { return static_cast (flux_evaluateur); } - inline const Evaluateur_VDF& evaluateur() const override { return static_cast (flux_evaluateur); } + inline Evaluateur_VDF& evaluateur() override { return static_cast (flux_evaluateur_); } + inline const Evaluateur_VDF& evaluateur() const override { return static_cast (flux_evaluateur_); } int impr(Sortie& os) const override; - void completer_() override { elem.ref(le_dom->face_voisins()); } + void completer_() override { elem_.ref(le_dom->face_voisins()); } void ajouter_contribution_autre_pb(const DoubleTab& inco, Matrice_Morse& matrice, const Cond_lim& la_cl, std::map>&) const override; void contribuer_au_second_membre(DoubleTab& ) const override; @@ -49,13 +49,13 @@ class Iterateur_VDF_Elem : public Iterateur_VDF_base void creer_champ_T_paroi_pour_flux_parietal() override { /* TODO FIXME */ } protected: - _TYPE_ flux_evaluateur; - IntTab elem; + _TYPE_ flux_evaluateur_; + IntTab elem_; mutable SFichier Flux, Flux_moment, Flux_sum; inline const Milieu_base& milieu() const { return (la_zcl->equation()).milieu(); } OBS_PTR(Correlation_base) corr_flux_parietal_; -private: + private_but_public_for_cuda template void ajouter_blocs_bords(const int , matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const; @@ -77,6 +77,7 @@ class Iterateur_VDF_Elem : public Iterateur_VDF_base void modifier_flux() const; template inline 
void fill_flux_tables_(const int, const int , const double , const Type_Double& , DoubleTab& ) const; + KOKKOS_INLINE_FUNCTION void fill_flux_tables_(const int, const int , const double , CIntTabView, DoubleArrView, DoubleTabView, DoubleTabView) const; void fill_derivee_cc(matrices_t mats, const tabs_t& semi_impl, VectorDeriv& d_cc) const; @@ -102,6 +103,7 @@ class Iterateur_VDF_Elem : public Iterateur_VDF_base template void contribuer_au_second_membre_bords_(const Echange_externe_impose& , const int , const int , const int, const int , const Front_VF& , DoubleTab& ) const; + }; #include // templates specializations ici ;) diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.tpp index 34f32fad1e..fae8e0199f 100644 --- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.tpp +++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.tpp @@ -21,6 +21,10 @@ #include #include #include +#include +#include +#include +#include template void Iterateur_VDF_Elem<_TYPE_>::ajouter_contribution_autre_pb(const DoubleTab& inco, Matrice_Morse& matrice, const Cond_lim& la_cl, std::map>& f2e) const @@ -35,9 +39,9 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_contribution_autre_pb(const DoubleTab& for (int f = ndeb; f < nfin; f++) { const int e1 = f2e[f].first, e2 = f2e[f].second; - flux_evaluateur.coeffs_face(f, ndeb, cl, aii, ajj); + flux_evaluateur_.coeffs_face(f, ndeb, cl, aii, ajj); for (int i = 0; i < ncomp; i++) - matrice(e1 * ncomp + i, e2 * ncomp + i) = -(elem(f, 0) > -1 ? aii[i] : ajj[i]); + matrice(e1 * ncomp + i, e2 * ncomp + i) = -(elem_(f, 0) > -1 ? 
aii[i] : ajj[i]); } } } @@ -49,18 +53,20 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_contribution_autre_pb(const DoubleTab& template void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs(matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const { - ((_TYPE_&) flux_evaluateur).mettre_a_jour(); + ((_TYPE_&) flux_evaluateur_).mettre_a_jour(); assert(op_base->equation().inconnue().valeurs().nb_dim() < 3 && la_zcl && le_dom); const int ncomp = op_base->equation().inconnue().valeurs().line_size(); DoubleTab& flux_bords = op_base->flux_bords(); flux_bords.resize(le_dom->nb_faces_bord(), ncomp); flux_bords = 0.; + // modif b.m.: on va faire += sur des items virtuels, initialiser les cases : sinon risque que les cases soient invalides ou non initialisees + DoubleArrView data = static_cast(secmem).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(secmem.size(), secmem.size_array()), KOKKOS_LAMBDA(const int i) { - int n = secmem.size_array() - secmem.size(); - double *data = secmem.addr() + secmem.size(); - for (; n; n--, data++) *data = 0.; - } + data(i)=0.; + }); + end_gpu_timer(__KERNEL_NAME__); if (ncomp == 1) { ajouter_blocs_bords < SingleDouble > (ncomp, mats, secmem, semi_impl); @@ -137,41 +143,49 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords(const int ncomp, matrices_t Process::exit(); break; } + Debog::verifier(la_cl.valeur().que_suis_je(),resu); } } template template -void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_interne(const int N, matrices_t mats, DoubleTab& resu, const tabs_t& semi_impl) const +void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_interne(const int N, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { - const DoubleTab& donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - Type_Double flux(N), aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? 
N : 1)), aef(N); - const int ndeb = le_dom->premiere_face_int(), nfin = le_dom->nb_faces(), Mv = le_ch_v ? le_ch_v->valeurs().line_size() : N; - for (int face = ndeb; face < nfin; face++) - { - flux_evaluateur.flux_faces_interne(donnee, face, flux); - const int e0 = elem(face, 0), e1 = elem(face, 1); - // second membre - for (int k = 0; k < N; k++) - { - resu(e0, k) += flux[k]; - resu(e1, k) -= flux[k]; - } - } + const DoubleTab& tab_donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); + const int ndeb = le_dom->premiere_face_int(), nfin = le_dom->nb_faces(); + + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView elem = elem_.view_ro(); + CDoubleTabView donnee = tab_donnee.view_ro(); + DoubleTabView resu = tab_resu.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({ndeb, 0}, {nfin, N}), + KOKKOS_LAMBDA(const int face, const int k) + { + double flux; + flux_evaluateur.flux_faces_interne_comp(donnee, face, k, flux); + const int e0 = elem(face, 0), e1 = elem(face, 1); + Kokkos::atomic_add(&resu(e0, k), +flux); + Kokkos::atomic_add(&resu(e1, k), -flux); + }); + end_gpu_timer(__KERNEL_NAME__); Matrice_Morse *m_vit = (mats.count("vitesse") && is_convective_op()) ? mats.at("vitesse") : nullptr, *mat = (!is_pb_multiphase() && mats.count(nom_ch_inco_)) ? mats.at(nom_ch_inco_) : nullptr; VectorDeriv d_cc; fill_derivee_cc(mats, semi_impl, d_cc); + Type_Double aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? N : 1)), aef(N); + const int Mv = le_ch_v ? 
le_ch_v->valeurs().line_size() : N; //derivees : vitesse if (m_vit) for (int face = ndeb; face < nfin; face++) { - flux_evaluateur.coeffs_faces_interne_bloc_vitesse(donnee, face, aef); + flux_evaluateur.coeffs_faces_interne_bloc_vitesse(tab_donnee, face, aef); for (int i = 0; i < 2; i++) for (int n = 0, m = 0; n < N; n++, m += (Mv > 1)) - (*m_vit)(N * elem(face, i) + n, Mv * face + m) += (i ? -1.0 : 1.0) * aef(n); + (*m_vit)(N * elem_(face, i) + n, Mv * face + m) += (i ? -1.0 : 1.0) * aef(n); } - //derivees : champ convecte if (mat || d_cc.size() > 0) for (int face = ndeb; face < nfin; face++) @@ -182,7 +196,7 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_interne(const int N, matrices_t m } template template -void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int ndeb, const int nfin, const int N, matrices_t mats, DoubleTab& resu, const tabs_t& semi_impl) const +void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int ndeb, const int nfin, const int N, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { constexpr bool is_Neum_paroi_adiab = std::is_same::value; @@ -199,9 +213,9 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int nd if (is_Neum_paroi_adiab) Process::exit(); // On bloque ici :-) - const DoubleTab& donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(), - val_b = sub_type(Champ_Face_base, le_champ_convecte_ou_inc.valeur()) ? DoubleTab() : - (use_base_val_b_ ? le_champ_convecte_ou_inc->Champ_base::valeur_aux_bords() : le_champ_convecte_ou_inc->valeur_aux_bords()); // si le champ associe est un champ_face, alors on est dans un operateur de div + const DoubleTab& tab_donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(), + tab_val_b = sub_type(Champ_Face_base, le_champ_convecte_ou_inc.valeur()) ? DoubleTab() : + (use_base_val_b_ ? 
le_champ_convecte_ou_inc->Champ_base::valeur_aux_bords() : le_champ_convecte_ou_inc->valeur_aux_bords()); // si le champ associe est un champ_face, alors on est dans un operateur de div Matrice_Morse *mat = (!is_pb_multiphase() && mats.count(nom_ch_inco_)) ? mats.at(nom_ch_inco_) : nullptr; VectorDeriv d_cc; @@ -211,34 +225,70 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int nd if (is_Temp_impose_flux_parietal || is_Neumann_flux_parietal || is_paroi_contact_flux_parietal) { fill_derivee_cc(mats, semi_impl, d_cc); - const DoubleTab& donnee2 = is_pb_multiphase() ? le_champ_convecte_ou_inc->valeurs() : donnee ; // On tente de toujours impliciter le flux parietal en pb multi lol + const DoubleTab& donnee2 = is_pb_multiphase() ? le_champ_convecte_ou_inc->valeurs() : tab_donnee ; // On tente de toujours impliciter le flux parietal en pb multi lol mat = mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr; // On tente de toujours impliciter le flux parietal en pb multi lol - ajouter_blocs_bords_flux_parietal_(cl, ndeb, nfin, N, donnee2, resu, mat, d_cc, semi_impl); + ajouter_blocs_bords_flux_parietal_(cl, ndeb, nfin, N, donnee2, tab_resu, mat, d_cc, semi_impl); } else { - int e, Mv = le_ch_v ? le_ch_v->valeurs().line_size() : N; - Type_Double flux(N), aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? 
N : 1)), aef(N); - for (int face = ndeb; face < nfin; face++) + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView elem = elem_.view_ro(); + CDoubleTabView donnee = tab_donnee.view_ro(); + CDoubleTabView val_b = tab_val_b.view_ro(); + BC_View bc_view; + // BC with imposed (non zero) values: + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) + bc_view.val[0] = cl.tab_val_imp().view_ro(); + else if constexpr (std::is_same_v) + bc_view.val[0] = cl.tab_flux_impose().view_ro(); + else if constexpr (std::is_same_v) { - flux_evaluateur.flux_face(donnee, val_b, face, cl, ndeb, flux); // Generic code - fill_flux_tables_(face, N, 1.0 /* coeff */, flux, resu); + bc_view.val[0] = cl.tab_h_imp().view_ro(); + bc_view.val[1] = cl.tab_T_ext().view_ro(); + if (cl.has_phi_ext()) bc_view.val[2] = cl.tab_phi_ext().view_ro(); } + else if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) + {} + else + { + cerr << "Error, forbid to assess a BC of type: " << std::string_view(typeid(BC).name()) << endl; + Process::exit(); + } + DoubleTabView resu = tab_resu.view_rw(); + DoubleTabView flux_bords = op_base->flux_bords().view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({ndeb, 0}, {nfin, N}), + KOKKOS_LAMBDA(const int face, const int k) + { + double flux_k; + flux_evaluateur.template flux_faces_bord_comp(donnee, val_b, face, bc_view, ndeb, k, flux_k); + const int elem1 = elem(face, 0), elem2 = elem(face, 1); + if (elem1 > -1) { Kokkos::atomic_add(&resu(elem1, k), +flux_k); flux_bords(face, k) += flux_k; } + if (elem2 > -1) { Kokkos::atomic_add(&resu(elem2, k), -flux_k); flux_bords(face, k) -= flux_k; } + }); + end_gpu_timer(__KERNEL_NAME__); Matrice_Morse *m_vit = (mats.count("vitesse") && is_convective_op()) ? 
mats.at("vitesse") : nullptr; - fill_derivee_cc(mats, semi_impl, d_cc); + int e, Mv = le_ch_v ? le_ch_v->valeurs().line_size() : N; //derivees : vitesse if (m_vit) { + Type_Double aef(N); const IntTab *fcl_v = le_ch_v ? &ref_cast(Champ_Face_base, le_ch_v.valeur()).fcl() : nullptr; for (int f = ndeb; f < nfin; f++) if ((*fcl_v)(f, 0) < 2) { - flux_evaluateur.coeffs_face_bloc_vitesse(donnee, val_b, f, cl, ndeb, aef); + flux_evaluateur.coeffs_face_bloc_vitesse(tab_donnee, tab_val_b, f, cl, ndeb, aef); for (int i = 0; i < 2; i++) - if ((e = elem(f, i)) >= 0) + if ((e = elem_(f, i)) >= 0) for (int n = 0, m = 0; n < N; n++, m += (Mv > 1)) (*m_vit)(N * e + n, Mv * f + m) += (i ? -1.0 : 1.0) * aef(n); } @@ -246,46 +296,61 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int nd //derivees : champ convecte if (mat || d_cc.size() > 0) - for (int face = ndeb; face < nfin; face++) - { - flux_evaluateur.coeffs_face(face, ndeb, cl, aii, ajj); // Generic code - fill_coeffs_matrices(face, aii, ajj, mat, d_cc); - } + { + Type_Double aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? N : 1)); + for (int face = ndeb; face < nfin; face++) + { + flux_evaluateur.coeffs_face(face, ndeb, cl, aii, ajj); // Generic code + fill_coeffs_matrices(face, aii, ajj, mat, d_cc); + } + } } } } template template void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Periodique& cl, const int ndeb, const int nfin, const int N, const Front_VF& frontiere_dis, - matrices_t mats, DoubleTab& resu, const tabs_t& semi_impl) const + matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { - DoubleTab& flux_bords = op_base->flux_bords(); if (_TYPE_::CALC_FLUX_FACES_PERIO) { - const DoubleTab& donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - - // Luis : je rajoute l'option multiscalar_diff dans les CL périodiques - Type_Double flux(N), aii(N * (multiscalar_diff_ ? 
N : 1)), ajj(N * (multiscalar_diff_ ? N : 1)), aef(N); - for (int face = ndeb; face < nfin; face++) - { - const int e0 = elem(face, 0), e1 = elem(face, 1); - flux_evaluateur.flux_face(donnee, donnee, face, cl, ndeb, flux); // attention 2 fois donnee - - for (int n = 0; n < N; n++) - { - if (e0 > -1) - { - resu(e0, n) += 0.5 * flux[n]; - if (face < (ndeb + frontiere_dis.nb_faces() / 2)) flux_bords(face, n) += flux[n]; - } - if (e1 > -1) - { - resu(e1, n) -= 0.5 * flux[n]; - if ((ndeb + frontiere_dis.nb_faces() / 2) <= face) flux_bords(face, n) -= flux[n]; - } - } - } + const DoubleTab& tab_donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); + int nb_faces = frontiere_dis.nb_faces(); + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + // DoubleTab tab_val_imp(1,1); + // PL: Even this tiny array should be DoubleTrav cause repeated allocation/allocation on device is sloooow + // Seen during profiling: 6% of runtime on dalia for Canal_VDF !!! 
+ DoubleTrav tab_val_imp(1,1); // Trick to pass to evaluateur flux_face the distance periodicity value + tab_val_imp(0,0) = cl.distance(); + BC_View bc_view; + bc_view.val[0] = tab_val_imp.view_ro(); + CIntTabView elem = elem_.view_ro(); + CDoubleTabView donnee = tab_donnee.view_ro(); + DoubleTabView resu = tab_resu.view_rw(); + DoubleTabView flux_bords = op_base->flux_bords().view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({ndeb, 0}, {nfin, N}), + KOKKOS_LAMBDA(const int face, const int k) + { + double flux_k; + flux_evaluateur.template flux_faces_bord_comp(donnee, donnee, face, bc_view, ndeb, k, flux_k); + const int e0 = elem(face, 0), e1 = elem(face, 1); + if (e0 > -1) + { + Kokkos::atomic_add(&resu(e0, k), +0.5 * flux_k); + if (face < (ndeb + nb_faces / 2)) Kokkos::atomic_add(&flux_bords(face, k), +flux_k); + } + if (e1 > -1) + { + Kokkos::atomic_add(&resu(e1, k), -0.5 * flux_k); + if ((ndeb + nb_faces / 2) <= face) Kokkos::atomic_add(&flux_bords(face, k), -flux_k); + } + }); + end_gpu_timer(__KERNEL_NAME__); + Type_Double aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? N : 1)), aef(N); Matrice_Morse *m_vit = mats.count("vitesse") ? mats.at("vitesse") : nullptr, *mat = (!is_pb_multiphase() && mats.count(nom_ch_inco_)) ? 
mats.at(nom_ch_inco_) : nullptr; VectorDeriv d_cc; fill_derivee_cc(mats, semi_impl, d_cc); @@ -294,14 +359,14 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Periodique& cl, cons if (m_vit) for (int face = ndeb; face < nfin; face++) { - const int e0 = elem(face, 0), e1 = elem(face, 1); - flux_evaluateur.coeffs_face_bloc_vitesse(donnee, DoubleTab(), face, cl, ndeb, aef); + const int e0 = elem_(face, 0), e1 = elem_(face, 1); + flux_evaluateur.coeffs_face_bloc_vitesse(tab_donnee, DoubleTab(), face, cl, ndeb, aef); if (e0 > -1) for (int i = 0; i < N; i++) - if (face < (ndeb + frontiere_dis.nb_faces() / 2)) (*m_vit)(e0 * N + i, face * N + i) += aef[i]; + if (face < (ndeb + nb_faces / 2)) (*m_vit)(e0 * N + i, face * N + i) += aef[i]; if (e1 > -1) for (int i = 0; i < N; i++) - if ((ndeb + frontiere_dis.nb_faces() / 2) <= face) (*m_vit)(e1 * N + i, face * N + i) -= aef[i]; + if ((ndeb + nb_faces / 2) <= face) (*m_vit)(e1 * N + i, face * N + i) -= aef[i]; } //derivees : champ convecte @@ -333,10 +398,11 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Echange_externe_impo boundary_index = num_cl; int e, Mv = le_ch_v ? 
le_ch_v->valeurs().line_size() : N; + ToDo_Kokkos("BC Echange_externe_impose"); for (int face = ndeb; face < nfin; face++) { const int local_face = le_dom->front_VF(boundary_index).num_local_face(face); - flux_evaluateur.flux_face(donnee, boundary_index, face, local_face, cl, ndeb, flux); + flux_evaluateur_.flux_face(donnee, boundary_index, face, local_face, cl, ndeb, flux); fill_flux_tables_(face, N, 1.0 /* coeff */, flux, resu); } @@ -351,10 +417,10 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Echange_externe_impo for (int face = ndeb; face < nfin; face++) { const int local_face = le_dom->front_VF(boundary_index).num_local_face(face); - flux_evaluateur.coeffs_face_bloc_vitesse(donnee, val_b, boundary_index, face, local_face, cl, ndeb, aef); + flux_evaluateur_.coeffs_face_bloc_vitesse(donnee, val_b, boundary_index, face, local_face, cl, ndeb, aef); for (int i = 0; i < 2; i++) - if ((e = elem(face, i)) >= 0) + if ((e = elem_(face, i)) >= 0) for (int n = 0, m = 0; n < N; n++, m += (Mv > 1)) (*m_vit)(N * e + n, Mv * face + m) += (i ? -1.0 : 1.0) * aef(n); } } @@ -364,7 +430,7 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Echange_externe_impo for (int face = ndeb; face < nfin; face++) { const int local_face = le_dom->front_VF(boundary_index).num_local_face(face); - flux_evaluateur.coeffs_face(donnee, boundary_index, face, local_face, ndeb, cl, aii, ajj); + flux_evaluateur_.coeffs_face(donnee, boundary_index, face, local_face, ndeb, cl, aii, ajj); fill_coeffs_matrices(face, aii, ajj, mat, d_cc); // XXX : Attention Yannick pour d_cc c'est pas tout a fait comme avant ... N et M ... 
} } @@ -390,21 +456,21 @@ inline void Iterateur_VDF_Elem<_TYPE_>::fill_coeffs_matrices(const int f, const if (mat) { for (int i = 0; i < 2; i++) - for (int j = 0, e = elem(f, i); j < 2; j++) - for (int n = 0, eb = elem(f, j); n < N; n++) + for (int j = 0, e = elem_(f, i); j < 2; j++) + for (int n = 0, eb = elem_(f, j); n < N; n++) for (int m = (multiscalar_diff_ ? 0 : n); m < (multiscalar_diff_ ? N : n + 1); m++) (*mat)(N * e + n, N * eb + m) += (i == j ? 1.0 : -1.0) * coeff * (j ? ajj[multiscalar_diff_ ? N * n + m : n] : aii[multiscalar_diff_ ? N * n + m : n]); } else for (auto &&d_m_i : d_cc) for (int i = 0; i < 2; i++) - for (int j = 0, e = elem(f, i); j < 2; j++) + for (int j = 0, e = elem_(f, i); j < 2; j++) { const int M = std::get<2> (d_m_i); const DoubleTab& d_var_cc = *std::get<0> (d_m_i); Matrice_Morse& d_var_operateur = *std::get<1> (d_m_i); - for (int n = 0, m = 0, eb = elem(f, j); n < N; n++, m += (M > 1)) + for (int n = 0, m = 0, eb = elem_(f, j); n < N; n++, m += (M > 1)) d_var_operateur(N * e + n, M * eb + m) += (i == j ? 1.0 : -1.0) * coeff * (j ? ajj[n] : aii[n]) * d_var_cc(eb, m); } } @@ -412,7 +478,7 @@ inline void Iterateur_VDF_Elem<_TYPE_>::fill_coeffs_matrices(const int f, const template template inline void Iterateur_VDF_Elem<_TYPE_>::fill_coeffs_matrices(const int face, Type_Double& aii, Type_Double& ajj, Matrice_Morse *mat, VectorDeriv& d_cc) const { - const int e0 = elem(face, 0), e1 = elem(face, 1); + const int e0 = elem_(face, 0), e1 = elem_(face, 1); const int N = multiscalar_diff_ ? 
int(sqrt(aii.size_array())) : aii.size_array(); if (mat) diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_FT_TCL.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_FT_TCL.tpp index 16746197ca..6286e20c0a 100644 --- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_FT_TCL.tpp +++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_FT_TCL.tpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2023, CEA +* Copyright (c) 2025, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -21,11 +21,29 @@ * Elie Saikali : NOTA BENE : fichier surcharger dans trio pour le FT, TCL model */ +template +KOKKOS_INLINE_FUNCTION void Iterateur_VDF_Elem<_TYPE_>::fill_flux_tables_(const int face, const int ncomp, const double coeff, CIntTabView elem, DoubleArrView flux, DoubleTabView resu, DoubleTabView flux_bords) const +{ + const int elem1 = elem(face, 0), elem2 = elem(face, 1); + if (elem1 > -1) + for (int k = 0; k < ncomp; k++) + { + Kokkos::atomic_add(&resu(elem1, k), + coeff * flux[k]); + flux_bords(face, k) += coeff * flux[k]; + } + if (elem2 > -1) + for (int k = 0; k < ncomp; k++) + { + Kokkos::atomic_add(&resu(elem2, k), - coeff * flux[k]); + flux_bords(face, k) -= coeff * flux[k]; + } +} + template template inline void Iterateur_VDF_Elem<_TYPE_>::fill_flux_tables_(const int face, const int ncomp, const double coeff, const Type_Double& flux, DoubleTab& resu) const { DoubleTab& flux_bords = op_base->flux_bords(); - const int elem1 = elem(face, 0), elem2 = elem(face, 1); + const int elem1 = elem_(face, 0), elem2 = elem_(face, 1); if (elem1 > -1) for (int k = 0; k < ncomp; k++) { diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_Multiphase_Parietal.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_Multiphase_Parietal.tpp index 67820dc4ed..5f6d67c5b0 100644 --- 
a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_Multiphase_Parietal.tpp +++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_Multiphase_Parietal.tpp @@ -122,9 +122,9 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_flux_parietal_(const BC& cl nv = 0.; d_nuc = 0.; - const int e = elem(face, 0) > -1 ? elem(face, 0) : elem(face, 1); + const int e = elem_(face, 0) > -1 ? elem_(face, 0) : elem_(face, 1); - const double y = elem(face, 0) > -1 ? le_dom->dist_face_elem0(face, e) : le_dom->dist_face_elem1(face, e); + const double y = elem_(face, 0) > -1 ? le_dom->dist_face_elem0(face, e) : le_dom->dist_face_elem1(face, e); // fill in struct in.N = N; @@ -181,7 +181,7 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_flux_parietal_(const BC& cl (*pdTf_qpi)(e, k, l, m) += dTf_qpi(k, l, m) * fs(face); for (int k = 0; k < N; k++) - flux[k] = (elem(face, 0) != -1) ? qpk(k) * fs(face) * pf(face) : -qpk(k) * fs(face) * pf(face); + flux[k] = (elem_(face, 0) != -1) ? qpk(k) * fs(face) * pf(face) : -qpk(k) * fs(face) * pf(face); fill_flux_tables_(face, N, 1.0, flux, resu); @@ -269,7 +269,7 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_flux_parietal_(const BC& cl (*pdTf_qpi)(e, k, l, m) += dTf_qpi(k, l, m) * fs(face); for (int k = 0; k < N; k++) - flux[k] = (elem(face, 0) != -1) ? qpk(k) * fs(face) : -qpk(k) * fs(face); + flux[k] = (elem_(face, 0) != -1) ? 
qpk(k) * fs(face) : -qpk(k) * fs(face); fill_flux_tables_(face, N, 1.0, flux, resu); diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_bis.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_bis.tpp index 13587fcf40..5335a8c284 100644 --- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_bis.tpp +++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_bis.tpp @@ -17,17 +17,15 @@ #define Iterateur_VDF_Elem_bis_TPP_included template -void Iterateur_VDF_Elem<_TYPE_>::modifier_flux() const +void Iterateur_VDF_Elem<_TYPE_>::modifier_flux() const { if (op_base->equation().inconnue().le_nom().debute_par("temperature") && !( sub_type(Operateur_Diff_base,op_base.valeur()) && ref_cast(Operateur_Diff_base,op_base.valeur()).diffusivite().le_nom() == "conductivite" ) ) { - DoubleTab& flux_bords=op_base->flux_bords(); const Domaine_VDF& le_dom_vdf=ref_cast(Domaine_VDF,op_base->equation().domaine_dis()); - const Champ_base& rho = (op_base->equation()).milieu().masse_volumique(); + const Champ_base& masse_volumique = (op_base->equation()).milieu().masse_volumique(); const Champ_Don_base& Cp = (op_base->equation()).milieu().capacite_calorifique(); - const IntTab& face_voisins=le_dom_vdf.face_voisins(); - int rho_uniforme = sub_type(Champ_Uniforme,rho) ? 1 : 0, cp_uniforme = sub_type(Champ_Uniforme,Cp) ? 1 : 0; + int rho_uniforme = sub_type(Champ_Uniforme,masse_volumique) ? 1 : 0, cp_uniforme = sub_type(Champ_Uniforme,Cp) ? 
1 : 0; int is_rho_u=op_base->equation().probleme().is_dilatable(); if (is_rho_u) { @@ -37,13 +35,21 @@ void Iterateur_VDF_Elem<_TYPE_>::modifier_flux() const if (ref_cast(Op_Conv_VDF_base,op).vitesse().le_nom()=="rho_u") is_rho_u = 1; } const int nb_faces_bords = le_dom_vdf.nb_faces_bord(); - for (int face = 0; face < nb_faces_bords; face++) - for(int k = 0; k < flux_bords.dimension(1); k++) + + CIntTabView face_voisins = le_dom_vdf.face_voisins().view_ro(); + CDoubleTabView rho = masse_volumique.valeurs().view_ro(); + CDoubleTabView cp = Cp.valeurs().view_ro(); + DoubleTabView flux_bords = op_base->flux_bords().view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces_bords), KOKKOS_LAMBDA(const int face) + { + for(int k = 0; k < (int)flux_bords.extent(1); k++) { int e = (face_voisins(face, 0) != -1) ? face_voisins(face, 0) : face_voisins(face, 1); - const double rho_ = (is_rho_u) ? 1.0 : rho.valeurs()(!rho_uniforme * e, k); - flux_bords(face, k) *= rho_ * Cp.valeurs()(!cp_uniforme * e, k); + const double rho_ = (is_rho_u) ? 1.0 : rho(!rho_uniforme * e, k); + flux_bords(face, k) *= rho_ * cp(!cp_uniforme * e, k); } + }); + end_gpu_timer(__KERNEL_NAME__); } } @@ -55,34 +61,40 @@ int Iterateur_VDF_Elem<_TYPE_>::impr(Sortie& os) const const int impr_bord=(madomaine.bords_a_imprimer().est_vide() ? 
0:1); const Schema_Temps_base& sch = la_zcl->equation().probleme().schema_temps(); double temps = sch.temps_courant(); - DoubleTab& flux_bords=op_base->flux_bords(); - DoubleVect bilan(flux_bords.dimension(1)); - int k,face; + DoubleTab& tab_flux_bords=op_base->flux_bords(); + DoubleVect bilan(tab_flux_bords.dimension(1)); int nb_front_Cl=le_dom->nb_front_Cl(); - DoubleTrav flux_bords2( 3, nb_front_Cl , flux_bords.dimension(1)); - flux_bords2=0; + DoubleTrav tab_flux_bords2( 3, nb_front_Cl , tab_flux_bords.dimension(1)); + tab_flux_bords2=0; /*flux_bord(k) -> flux_bords2(0,num_cl,k) */ /*flux_bord_perio1(k) -> flux_bords2(1,num_cl,k) */ /*flux_bord_perio2(k) -> flux_bords2(2,num_cl,k) */ + const int ncomp = tab_flux_bords.dimension(1); + CDoubleTabView flux_bords = tab_flux_bords.view_ro(); + DoubleTabView3 flux_bords2 = tab_flux_bords2.view_rw<3>(); for (int num_cl=0; num_clles_conditions_limites(num_cl); const Front_VF& frontiere_dis = ref_cast(Front_VF,la_cl->frontiere_dis()); - int ndeb = frontiere_dis.num_premiere_face(); - int nfin = ndeb + frontiere_dis.nb_faces(); - int periodicite = (type_cl(la_cl)==periodique?1:0); - for (face=ndeb; faceouvrir_fichier(Flux,"",1); @@ -91,18 +103,18 @@ int Iterateur_VDF_Elem<_TYPE_>::impr(Sortie& os) const { const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl); int periodicite = (type_cl(la_cl)==periodique?1:0); - for(k=0; k::impr(Sortie& os) const if (madomaine.bords_a_imprimer().contient(la_fr.le_nom())) { Flux_face << "# Flux par face sur " << la_fr.le_nom() << " au temps " << temps << " : " << finl; - for (face=ndeb; face= 0) + size(n_type)++; + } + for (int n_type = 0; n_type < n_types; n_type++) + { + aretes_bord_par_type_[n_type].resize(size(n_type)); + size(n_type) = 0; + } + for (int num_arete = 0; num_arete < type_arete_bord.size(); num_arete++) + { + int n_type = type_arete_bord[num_arete]; + if (n_type >= 0) + { + aretes_bord_par_type_[n_type][size(n_type)] = premiere_arete_bord + num_arete; + 
size(n_type)++; + } } } + + ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::PAROI_PAROI],ncomp,mats,secmem,semi_impl); + ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_PAR_FL, Type_Flux_Arete::PAROI_FLUIDE, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::PAROI_FLUIDE], ncomp, mats, secmem, semi_impl); + ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_FL, Type_Flux_Arete::FLUIDE, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::FLUIDE_FLUIDE],ncomp,mats,secmem,semi_impl); + ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_NAVIER_PAR, Type_Flux_Arete::NAVIER_PAROI, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::PAROI_NAVIER],ncomp,mats,secmem,semi_impl); + ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_NAVIER_FL, Type_Flux_Arete::NAVIER_FLUIDE, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::FLUIDE_NAVIER], ncomp, mats, secmem, semi_impl); + ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_NAVIER, Type_Flux_Arete::NAVIER, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::NAVIER_NAVIER],ncomp,mats,secmem,semi_impl); + ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_PERIO, Type_Flux_Arete::PERIODICITE, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::PERIO_PERIO], ncomp, mats, secmem, semi_impl); } template template std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void> -Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const int ncomp, const matrices_t& mats, DoubleTab& secmem, const tabs_t& semi_impl) const +Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const IntVect& tab_aretes_bord, const int ncomp, const matrices_t& mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { if (should_calc_flux) { constexpr bool is_PAROI = (Arete_Type == Type_Flux_Arete::PAROI); - Type_Double flux(ncomp), aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp); - const int n = 
le_dom->nb_faces_bord(), fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3); - DoubleTab& tab_flux_bords = op_base->flux_bords(); - const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : - &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); + const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : + &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); // second membre - flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, signe, flux); - for (int k = 0; k < ncomp; k++) - { - secmem(fac3, k) += signe * flux[k]; - if (is_PAROI) - { - if (fac1 < n) tab_flux_bords(fac1, orientation(fac3)) -= 0.5 * signe * flux[k]; - if (fac2 < n) tab_flux_bords(fac2, orientation(fac3)) -= 0.5 * signe * flux[k]; - } - } + const int n = le_dom->nb_faces_bord(); + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView Qdm = Qdm_.view_ro(); + CIntArrView aretes_bord = tab_aretes_bord.view_ro(); + CIntArrView orientation = orientation_.view_ro(); + CDoubleTabView a_r; + CDoubleTabView inco = tab_inco.view_ro(); + CDoubleTabView val_imp_face_bord = val_imp_face_bord_.view_ro(); + CDoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_ro(); + DoubleTabView resu = tab_resu.view_rw(); + DoubleTabView flux_bords = op_base->flux_bords().view_rw(); + int size = tab_aretes_bord.size(); + 
Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({0, 0}, {size, ncomp}), + KOKKOS_LAMBDA(const int index, const int k) + { + const int n_arete = aretes_bord(index); + const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3); + double flux; + flux_evaluateur.template flux_arete_comp(inco, val_imp_face_bord, coeff_frottement_face_bord, a_r, fac1, fac2, fac3, signe, k, flux); + Kokkos::atomic_add(&resu(fac3, k), + signe * flux); + if (is_PAROI) + { + if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.5 * signe * flux); + if (fac2 < n) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac3)), - 0.5 * signe * flux); + } + }); + end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr); if (matrice) { - flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2); - for (int i = 0; i < ncomp; i++) - fill_coeff_matrice_morse < Type_Double > (fac3, i, ncomp, signe, aii3_4, *matrice); + Type_Double aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp); + for (int index = 0; index < tab_aretes_bord.size(); index++) + { + const int n_arete = tab_aretes_bord(index); + const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), signe = Qdm_(n_arete, 3); + flux_evaluateur.template coeffs_arete(val_imp_face_bord_, coeff_frottement_face_bord_, tab_a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2); + for (int i = 0; i < ncomp; i++) + fill_coeff_matrice_morse(fac3, i, ncomp, signe, aii3_4, *matrice); + } } } } template template std::enable_if_t -Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const int ncomp, const matrices_t& mats, DoubleTab& secmem, const tabs_t& semi_impl) 
const +Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const IntVect& tab_aretes_bord, const int ncomp, const matrices_t& mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { if (should_calc_flux) { constexpr bool is_FLUIDE = (Arete_Type == Type_Flux_Arete::FLUIDE), is_PAROI_FL = (Arete_Type == Type_Flux_Arete::PAROI_FLUIDE); - Type_Double flux3(ncomp), flux1_2(ncomp), aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp); - const int n = le_dom->nb_faces_bord(), fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3); - DoubleTab& tab_flux_bords = op_base->flux_bords(); - const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : - &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); + const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? 
&semi_impl.at("alpha_rho") : + &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); // second membre - flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, signe, flux3, flux1_2); - for (int k = 0; k < ncomp; k++) - secmem(fac3, k) += signe * flux3[k]; - - fill_resu_tab < Type_Double > (fac1, fac2, ncomp, flux1_2, secmem); - - if (is_FLUIDE || is_PAROI_FL) - { - if (fac1 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac1, orientation(fac3)) -= 0.5 * signe * flux3[k]; - - if (fac2 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac2, orientation(fac3)) -= 0.5 * signe * flux3[k]; - } + const int n = le_dom->nb_faces_bord(); + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView Qdm = Qdm_.view_ro(); + CIntArrView aretes_bord = tab_aretes_bord.view_ro(); + CIntArrView orientation = orientation_.view_ro(); + CDoubleTabView a_r; + if (tab_a_r!=nullptr) a_r = tab_a_r->view_ro(); + CDoubleTabView inco = tab_inco.view_ro(); + CDoubleTabView val_imp_face_bord = val_imp_face_bord_.view_ro(); + CDoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_ro(); + DoubleTabView resu = tab_resu.view_rw(); + DoubleTabView flux_bords = op_base->flux_bords().view_rw(); + int size = tab_aretes_bord.size(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({0, 0}, {size, ncomp}), + KOKKOS_LAMBDA(const int index, const int k) + { + const int n_arete = aretes_bord(index); + const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3); + double flux3, flux1_2; + flux_evaluateur.template flux_arete_comp(inco, val_imp_face_bord, coeff_frottement_face_bord, a_r, fac1, fac2, fac3, signe, ncomp, k, flux3, flux1_2); + Kokkos::atomic_add(&resu(fac3, k), + signe * flux3); + 
Kokkos::atomic_add(&resu(fac1, k), + flux1_2); + Kokkos::atomic_add(&resu(fac2, k), - flux1_2); + if (is_FLUIDE || is_PAROI_FL) + { + if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.5 * signe * flux3); + if (fac2 < n) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac3)), - 0.5 * signe * flux3); + } + }); + end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr); if (matrice) { - flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2); - for (int i = 0; i < ncomp; i++) + Type_Double aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp); + for (int index = 0; index < tab_aretes_bord.size(); index++) { - fill_coeff_matrice_morse < Type_Double > (fac3, i, ncomp, signe, aii3_4, *matrice); - fill_coeff_matrice_morse < Type_Double > (fac1, fac2, i, ncomp, aii1_2, ajj1_2, *matrice); + const int n_arete = tab_aretes_bord(index); + const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), signe = Qdm_(n_arete, 3); + flux_evaluateur.template coeffs_arete(val_imp_face_bord_, coeff_frottement_face_bord_, tab_a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2); + for (int i = 0; i < ncomp; i++) + { + fill_coeff_matrice_morse(fac3, i, ncomp, signe, aii3_4, *matrice); + fill_coeff_matrice_morse(fac1, fac2, i, ncomp, aii1_2, ajj1_2, *matrice); + } } } } @@ -192,41 +253,58 @@ Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const template template std::enable_if_t -Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const int ncomp, const matrices_t& mats, DoubleTab& secmem, const tabs_t& semi_impl) const +Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const IntVect& tab_aretes_bord, const int ncomp, const 
matrices_t& mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { if (should_calc_flux) { - Type_Double flux3_4(ncomp), flux1_2(ncomp), aii(ncomp), ajj(ncomp); - const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3); - const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - - + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); // second membre - flux_evaluateur.template flux_arete < Arete_Type > (inco, nullptr, fac1, fac2, fac3, fac4, flux3_4, flux1_2); - for (int k = 0; k < ncomp; k++) - { - secmem(fac3, k) += 0.5 * flux3_4[k]; - secmem(fac4, k) -= 0.5 * flux3_4[k]; - } - - fill_resu_tab < Type_Double > (fac1, fac2, ncomp, flux1_2, secmem); + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView Qdm = Qdm_.view_ro(); + CIntArrView aretes_bord = tab_aretes_bord.view_ro(); + CDoubleTabView a_r; + CDoubleTabView inco = tab_inco.view_ro(); + DoubleTabView resu = tab_resu.view_rw(); + int size = tab_aretes_bord.size(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({0, 0}, {size, ncomp}), + KOKKOS_LAMBDA(const int index, const int k) + { + const int n_arete = aretes_bord(index); + const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3); + double flux3_4, flux1_2; + flux_evaluateur.template flux_arete_comp(inco, a_r, fac1, fac2, fac3, fac4, k, flux3_4, flux1_2); + Kokkos::atomic_add(&resu(fac3, k), + 0.5 * flux3_4); + Kokkos::atomic_add(&resu(fac4, k), - 0.5 * flux3_4); + Kokkos::atomic_add(&resu(fac1, k), + flux1_2); + Kokkos::atomic_add(&resu(fac2, k), - flux1_2); + }); + end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte Matrice_Morse *matrice = (is_pb_multi && 
is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr); if (matrice) { - flux_evaluateur.template coeffs_arete < Arete_Type > (nullptr, fac3, fac4, fac1, fac2, aii, ajj); - for (int i = 0; i < ncomp; i++) - fill_coeff_matrice_morse(fac1, fac2, i, ncomp, aii, ajj, *matrice); - - flux_evaluateur.template coeffs_arete < Arete_Type > (nullptr, fac1, fac2, fac3, fac4, aii, ajj); - for (int i = 0; i < ncomp; i++) + Type_Double aii(ncomp), ajj(ncomp); + for (int index = 0; index < tab_aretes_bord.size(); index++) { - aii[i] *= 0.5; - ajj[i] *= 0.5; - fill_coeff_matrice_morse < Type_Double > (fac3, fac4, i, ncomp, aii, ajj, *matrice); + const int n_arete = tab_aretes_bord(index); + const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), fac4 = Qdm_(n_arete, 3); + flux_evaluateur.template coeffs_arete(nullptr, fac3, fac4, fac1, fac2, aii, ajj); + for (int i = 0; i < ncomp; i++) + fill_coeff_matrice_morse(fac1, fac2, i, ncomp, aii, ajj, *matrice); + + flux_evaluateur_.template coeffs_arete(nullptr, fac1, fac2, fac3, fac4, aii, ajj); + for (int i = 0; i < ncomp; i++) + { + aii[i] *= 0.5; + ajj[i] *= 0.5; + fill_coeff_matrice_morse(fac3, fac4, i, ncomp, aii, ajj, *matrice); + } } } } @@ -238,107 +316,163 @@ Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const template template void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins(const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const { - for (int n_arete = premiere_arete_coin; n_arete < derniere_arete_coin; n_arete++) + if (aretes_coin_par_type_.size()==0) { - const int n_type = type_arete_coin(n_arete - premiere_arete_coin); - switch(n_type) + // Fill aretes_coins_par_type_ + const int n_types = 17; // See enum type_arete in Domaine_Cl_VDF.h + aretes_coin_par_type_.resize(n_types); + ArrOfInt size(n_types); + for (int 
num_arete = 0; num_arete < type_arete_coin.size(); num_arete++) { - case TypeAreteCoinVDF::PAROI_FLUIDE: - ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::PAROI_FLUIDE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl); - break; - case TypeAreteCoinVDF::FLUIDE_PAROI: - ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::FLUIDE_PAROI, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl); - break; - case TypeAreteCoinVDF::PERIO_PAROI: - ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::PERIO_PAROI, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl); - break; - case TypeAreteCoinVDF::FLUIDE_FLUIDE: - ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_COIN_FL, Type_Flux_Arete::COIN_FLUIDE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl); - break; - case TypeAreteCoinVDF::PERIO_PERIO: - ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PERIO, Type_Flux_Arete::PERIODICITE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl); - break; - default: - break; + int n_type = type_arete_coin[num_arete]; + if (n_type >= 0) + size(n_type)++; + } + for (int n_type = 0; n_type < n_types; n_type++) + { + aretes_coin_par_type_[n_type].resize(size(n_type)); + size(n_type) = 0; + } + for (int num_arete = 0; num_arete < type_arete_coin.size(); num_arete++) + { + int n_type = type_arete_coin[num_arete]; + if (n_type >= 0) + { + aretes_coin_par_type_[n_type][size(n_type)] = premiere_arete_coin + num_arete; + size(n_type)++; + } } } + ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::PAROI_FLUIDE, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::PAROI_FLUIDE], ncomp, mats, secmem, semi_impl); + ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::FLUIDE_PAROI, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::FLUIDE_PAROI], ncomp, mats, secmem, semi_impl); + 
ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::PERIO_PAROI, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::PERIO_PAROI], ncomp, mats, secmem, semi_impl); + ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_COIN_FL, Type_Flux_Arete::COIN_FLUIDE, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::FLUIDE_FLUIDE], ncomp, mats, secmem, semi_impl); + ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PERIO, Type_Flux_Arete::PERIODICITE, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::PERIO_PERIO], ncomp, mats, secmem, semi_impl); } template template std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI, void> -Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const int n_arete, const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const +Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const IntVect& tab_aretes_coin, const int ncomp, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { if (should_calc_flux) { constexpr bool is_PERIO_PAROI = (Arete_Type_Coin == TypeAreteCoinVDF::PERIO_PAROI); - Type_Double flux(ncomp), aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp); - const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3); - DoubleTab& tab_flux_bords = op_base->flux_bords(); - const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : - &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); + + const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? 
&semi_impl.at("alpha_rho") : + &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); // second membre - flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, signe, flux); - for (int k = 0; k < ncomp; k++) - { - secmem(fac3, k) += signe * flux[k]; - if (is_PERIO_PAROI) /* on met 0.25 sur les deux faces (car on ajoutera deux fois la contrib) */ - { - tab_flux_bords(fac1, orientation(fac3)) -= 0.25 * signe * flux[k]; - tab_flux_bords(fac2, orientation(fac3)) -= 0.25 * signe * flux[k]; - } - else - tab_flux_bords(fac1, orientation(fac3)) -= 0.5 * signe * flux[k]; - } + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView Qdm = Qdm_.view_ro(); + CIntArrView aretes_coin = tab_aretes_coin.view_ro(); + CIntArrView orientation = orientation_.view_ro(); + CDoubleTabView a_r; + if (tab_a_r!=nullptr) a_r = tab_a_r->view_ro(); + CDoubleTabView inco = tab_inco.view_ro(); + CDoubleTabView val_imp_face_bord = val_imp_face_bord_.view_ro(); + CDoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_ro(); + DoubleTabView resu = tab_resu.view_rw(); + DoubleTabView flux_bords = op_base->flux_bords().view_rw(); + int size = tab_aretes_coin.size(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({0, 0}, {size, ncomp}), + KOKKOS_LAMBDA(const int index, const int k) + { + const int n_arete = aretes_coin(index); + const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3); + double flux; + flux_evaluateur.template flux_arete_comp(inco, val_imp_face_bord, coeff_frottement_face_bord, a_r, fac1, fac2, fac3, signe, k, flux); + Kokkos::atomic_add(&resu(fac3, k), + signe * flux); + if (is_PERIO_PAROI) + { + Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.25 * signe * flux); + 
Kokkos::atomic_add(&flux_bords(fac2, orientation(fac3)), - 0.25 * signe * flux); + } + else + Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.5 * signe * flux); + }); + end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr); if (matrice) { - flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2); - for (int i = 0; i < ncomp; i++) - fill_coeff_matrice_morse(fac3, i, ncomp, signe, aii3_4, *matrice); + Type_Double aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp); + for (int index=0; index(val_imp_face_bord_, coeff_frottement_face_bord_, tab_a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2); + for (int i = 0; i < ncomp; i++) + fill_coeff_matrice_morse(fac3, i, ncomp, signe, aii3_4, *matrice); + } } } } template template std::enable_if_t -Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const int n_arete, const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const +Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const IntVect& tab_aretes_coin, const int ncomp, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { if (should_calc_flux) { - Type_Double flux3(ncomp), flux1_2(ncomp), aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp); - const int n = le_dom->nb_faces_bord(), fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3); - DoubleTab& tab_flux_bords = op_base->flux_bords(); - const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? 
&semi_impl.at("alpha_rho") : - &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); + + const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : + &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); // second membre - flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, signe, flux3, flux1_2); - for (int k = 0; k < ncomp; k++) - { - secmem(fac3, k) += signe * flux3[k]; - secmem(fac1, k) += flux1_2[k]; - if (fac1 < n) tab_flux_bords(fac1, orientation(fac3)) -= 0.5 * signe * flux3[k]; - } + const int n = le_dom->nb_faces_bord(); + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView Qdm = Qdm_.view_ro(); + CIntArrView aretes_coin = tab_aretes_coin.view_ro(); + CIntArrView orientation = orientation_.view_ro(); + CDoubleTabView a_r; + if (tab_a_r!=nullptr) a_r = tab_a_r->view_ro(); + CDoubleTabView inco = tab_inco.view_ro(); + CDoubleTabView val_imp_face_bord = val_imp_face_bord_.view_ro(); + CDoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_ro(); + DoubleTabView resu = tab_resu.view_rw(); + DoubleTabView flux_bords = op_base->flux_bords().view_rw(); + int size = tab_aretes_coin.size(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({0, 0}, {size, ncomp}), + KOKKOS_LAMBDA(const int index, const int k) + { + const int n_arete = aretes_coin(index); + const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3); + double flux3, flux1_2; + flux_evaluateur.template flux_arete_comp(inco, val_imp_face_bord, coeff_frottement_face_bord, 
a_r, fac1, fac2, fac3, signe, k, flux3, flux1_2); + Kokkos::atomic_add(&resu(fac3, k), + signe * flux3); + Kokkos::atomic_add(&resu(fac1, k), + flux1_2); + if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.5 * signe * flux3); + }); + end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr); if (matrice) { - flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2); - - for (int i = 0; i < ncomp; i++) + Type_Double aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp); + for (int index=0; index (fac1, i, ncomp, 1, aii1_2, *matrice); + const int n_arete = tab_aretes_coin(index); + const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), signe = Qdm_(n_arete, 3); + flux_evaluateur.template coeffs_arete(val_imp_face_bord_, coeff_frottement_face_bord_, tab_a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2); + for (int i = 0; i < ncomp; i++) + { + fill_coeff_matrice_morse(fac3, i, ncomp, signe, aii3_4, *matrice); + fill_coeff_matrice_morse(fac1, i, ncomp, 1, aii1_2, *matrice); + } } } } @@ -346,36 +480,55 @@ Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const int n_arete, const template template std::enable_if_t -Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const int n_arete, const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const +Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const IntVect& tab_aretes_coin, const int ncomp, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { if (should_calc_flux) { - Type_Double flux3_4(ncomp), flux1_2(ncomp), aii(ncomp), ajj(ncomp); - const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3); - const 
DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); // second membre - flux_evaluateur.template flux_arete < Arete_Type > (inco, nullptr, fac1, fac2, fac3, fac4, flux3_4, flux1_2); - for (int k = 0; k < ncomp; k++) - { - secmem(fac3, k) += 0.5 * flux3_4[k]; - secmem(fac4, k) -= 0.5 * flux3_4[k]; - secmem(fac1, k) += 0.5 * flux1_2[k]; - secmem(fac2, k) -= 0.5 * flux1_2[k]; - } + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView Qdm = Qdm_.view_ro(); + CIntArrView aretes_coin = tab_aretes_coin.view_ro(); + CDoubleTabView a_r; + CDoubleTabView inco = tab_inco.view_ro(); + DoubleTabView resu = tab_resu.view_rw(); + int size = tab_aretes_coin.size(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({0, 0}, {size, ncomp}), + KOKKOS_LAMBDA(const int index, const int k) + { + const int n_arete = aretes_coin(index); + const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3); + double flux3_4, flux1_2; + flux_evaluateur.template flux_arete_comp(inco, a_r, fac1, fac2, fac3, fac4, k, flux3_4, flux1_2); + Kokkos::atomic_add(&resu(fac3, k), + 0.5 * flux3_4); + Kokkos::atomic_add(&resu(fac4, k), - 0.5 * flux3_4); + Kokkos::atomic_add(&resu(fac1, k), + 0.5 * flux1_2); + Kokkos::atomic_add(&resu(fac2, k), - 0.5 * flux1_2); + }); + end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? 
mats.at(nom_ch_inco_) : nullptr); if (matrice) { - flux_evaluateur.template coeffs_arete < Arete_Type > (nullptr, fac3, fac4, fac1, fac2, aii, ajj); - for (int i = 0; i < ncomp; i++) - fill_coeff_matrice_morse < Type_Double > (fac1, fac2, i, ncomp, aii, ajj, *matrice); - - flux_evaluateur.template coeffs_arete < Arete_Type > (nullptr, fac1, fac2, fac3, fac4, aii, ajj); - for (int i = 0; i < ncomp; i++) - fill_coeff_matrice_morse < Type_Double > (fac3, fac4, i, ncomp, aii, ajj, *matrice); + Type_Double aii(ncomp), ajj(ncomp); + for (int index=0; index(nullptr, fac3, fac4, fac1, fac2, aii, ajj); + for (int i = 0; i < ncomp; i++) + fill_coeff_matrice_morse(fac1, fac2, i, ncomp, aii, ajj, *matrice); + + flux_evaluateur.template coeffs_arete(nullptr, fac1, fac2, fac3, fac4, aii, ajj); + for (int i = 0; i < ncomp; i++) + fill_coeff_matrice_morse(fac3, fac4, i, ncomp, aii, ajj, *matrice); + } } } } @@ -401,87 +554,87 @@ void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_mixtes(const int ncomp, ma template template std::enable_if_t -Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_generique_(const int debut, const int fin, const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const +Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_generique_(const int ndeb, const int nfin, const int ncomp, matrices_t mats, DoubleTab& tab_secmem, const tabs_t& semi_impl) const { - // XXX : tab_flux_bords rempli seulement si MIXTE ... ie pas INTERNE ! + if (should_calc_flux) { constexpr bool is_MIXTE = (Arete_Type == Type_Flux_Arete::MIXTE); - Type_Double flux(ncomp), aii(ncomp), ajj(ncomp); - DoubleTab& tab_flux_bords = op_base->flux_bords(); - const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - - const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? 
&semi_impl.at("alpha_rho") : - &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); - + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); + const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : + &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); + const int n = le_dom->nb_faces_bord(), n2 = le_dom->nb_faces_tot(); /* GF pour assurer bilan seq = para */ + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView Qdm = Qdm_.view_ro(); + CIntArrView orientation = orientation_.view_ro(); + CDoubleTabView inco = tab_inco.view_ro(); + CDoubleTabView a_r = tab_a_r != nullptr ? tab_a_r->view_ro() : ConstView(); + DoubleTabView secmem = tab_secmem.view_rw(); + DoubleTabView flux_bords = op_base->flux_bords().view_rw(); // XXX : flux_bords rempli seulement si MIXTE ... ie pas INTERNE ! 
// second membre - for (int n_arete = debut; n_arete < fin; n_arete++) - { - flux = 0.; - const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3); - const int n = le_dom->nb_faces_bord(), n2 = le_dom->nb_faces_tot(); /* GF pour assurer bilan seq = para */ - flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, fac4, flux); - fill_resu_tab < Type_Double > (fac3, fac4, ncomp, flux, secmem); - - if (is_MIXTE) - { - if (fac4 < n2) - { - if (fac1 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac1, orientation(fac3)) -= flux[k]; - - if (fac2 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac2, orientation(fac4)) -= flux[k]; - } - if (fac3 < n2) - { - if (fac1 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac1, orientation(fac3)) += flux[k]; - - if (fac2 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac2, orientation(fac4)) += flux[k]; - } - } - - flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac3, fac4, fac1, fac2, flux); - fill_resu_tab < Type_Double > (fac1, fac2, ncomp, flux, secmem); - if (is_MIXTE) - { - if (fac2 < n2) - { - if (fac3 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac3, orientation(fac1)) -= flux[k]; - - if (fac4 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac4, orientation(fac2)) -= flux[k]; - } - if (fac1 < n2) - { - if (fac3 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac3, orientation(fac1)) += flux[k]; - - if (fac4 < n) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac4, orientation(fac2)) += flux[k]; - } - } - } + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({ndeb, 0}, {nfin, ncomp}), + KOKKOS_LAMBDA(const int n_arete, const int k) + { + const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3); + double flux_34; + flux_evaluateur.template flux_arete_comp(inco, a_r, fac1, fac2, fac3, fac4, k, flux_34); + 
Kokkos::atomic_add(&secmem(fac3, k), +flux_34); + Kokkos::atomic_add(&secmem(fac4, k), -flux_34); + if (is_MIXTE) + { + if (fac4 < n2) + { + if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), -flux_34); + if (fac2 < n) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac4)), -flux_34); + } + if (fac3 < n2) + { + if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), +flux_34); + if (fac2 < n) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac4)), +flux_34); + } + } + double flux_12; + flux_evaluateur.template flux_arete_comp(inco, a_r, fac3, fac4, fac1, fac2, k, flux_12); + Kokkos::atomic_add(&secmem(fac1, k), +flux_12); + Kokkos::atomic_add(&secmem(fac2, k), -flux_12); + if (is_MIXTE) + { + if (fac2 < n2) + { + if (fac3 < n) Kokkos::atomic_add(&flux_bords(fac3, orientation(fac1)), -flux_12); + if (fac4 < n) Kokkos::atomic_add(&flux_bords(fac4, orientation(fac2)), -flux_12); + } + if (fac1 < n2) + { + if (fac3 < n) Kokkos::atomic_add(&flux_bords(fac3, orientation(fac1)), +flux_12); + if (fac4 < n) Kokkos::atomic_add(&flux_bords(fac4, orientation(fac2)), +flux_12); + } + } + }); + end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? 
mats.at(nom_ch_inco_) : nullptr); if (matrice) - for (int n_arete = debut; n_arete < fin; n_arete++) - { - aii = 0., ajj = 0.; - const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3); + { + Type_Double aii(ncomp), ajj(ncomp); + for (int n_arete = ndeb; n_arete < nfin; n_arete++) + { + aii = 0., ajj = 0.; + const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), fac4 = Qdm_(n_arete, 3); - flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac3, fac4, fac1, fac2, aii, ajj); - for (int i = 0; i < ncomp; i++) - fill_coeff_matrice_morse < Type_Double > (fac1, fac2, i, ncomp, aii, ajj, *matrice); + flux_evaluateur.template coeffs_arete(tab_a_r, fac3, fac4, fac1, fac2, aii, ajj); + for (int i = 0; i < ncomp; i++) + fill_coeff_matrice_morse(fac1, fac2, i, ncomp, aii, ajj, *matrice); - flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, fac4, aii, ajj); - for (int i = 0; i < ncomp; i++) - fill_coeff_matrice_morse < Type_Double > (fac3, fac4, i, ncomp, aii, ajj, *matrice); - } + flux_evaluateur.template coeffs_arete(tab_a_r, fac1, fac2, fac3, fac4, aii, ajj); + for (int i = 0; i < ncomp; i++) + fill_coeff_matrice_morse(fac3, fac4, i, ncomp, aii, ajj, *matrice); + } + } } } @@ -517,44 +670,60 @@ void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_sortie_libre(const int ncomp, } template template -void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_sortie_libre_(const int num_cl, const int ncomp , matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const +void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_sortie_libre_(const int num_cl, const int ncomp , matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { // TODO : FIXME : tab_flux_bords pas rempli ... if (should_calc_flux) { - Type_Double flux(ncomp), aii(ncomp), ajj(ncomp); - const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? 
semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); + + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl); const Front_VF& frontiere_dis = ref_cast(Front_VF, la_cl->frontiere_dis()); const int ndeb = frontiere_dis.num_premiere_face(), nfin = ndeb + frontiere_dis.nb_faces(); - const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : - &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); + const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : + &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); // second membre - for (int face = ndeb; face < nfin; face++) - { - flux_evaluateur.template flux_fa7 < Fa7_Type > (inco, a_r, face, (const Neumann_sortie_libre&) la_cl.valeur(), ndeb, flux); - if ((elem(face, 0)) > -1) - for (int k = 0; k < ncomp; k++) secmem(face, k) += flux[k]; - - if ((elem(face, 1)) > -1) - for (int k = 0; k < ncomp; k++) secmem(face, k) -= flux[k]; - } + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView elem = elem_.view_ro(); + CDoubleTabView inco = tab_inco.view_ro(); + CDoubleTabView a_r; + if (tab_a_r != nullptr) a_r = tab_a_r->view_ro(); + CDoubleTabView flux_impose = ref_cast(Neumann_sortie_libre, la_cl.valeur()).tab_flux_impose().view_ro(); // Used by Genepi+ !!! 
+ DoubleTabView resu = tab_resu.view_rw(); + // second membre + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({ndeb, 0}, {nfin, ncomp}), + KOKKOS_LAMBDA(const int face, const int k) + { + double flux; + flux_evaluateur.template flux_fa7_comp(inco, a_r, face, flux_impose, ndeb, k, flux); + if (elem(face, 0) > -1) Kokkos::atomic_add(&resu(face, k), +flux); + if (elem(face, 1) > -1) Kokkos::atomic_add(&resu(face, k), -flux); + }); + end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr); if (matrice) - for (int face = ndeb; face < nfin; face++) - { - flux_evaluateur.template coeffs_fa7 < Fa7_Type > (a_r, face, (const Neumann_sortie_libre&) la_cl.valeur(), aii, ajj); - if ((elem(face, 0)) > -1) - for (int i = 0; i < ncomp; i++) fill_coeff_matrice_morse < Type_Double > (face, i, ncomp, 1, aii, *matrice); - - if ((elem(face, 1)) > -1) - for (int i = 0; i < ncomp; i++) fill_coeff_matrice_morse < Type_Double > (face, i, ncomp, 1, ajj, *matrice); - } + { + Type_Double aii(ncomp), ajj(ncomp); + for (int face = ndeb; face < nfin; face++) + { + flux_evaluateur.template coeffs_fa7(tab_a_r, face, (const Neumann_sortie_libre&) la_cl.valeur(), aii, ajj); + if ((elem_(face, 0)) > -1) + for (int i = 0; i < ncomp; i++) + fill_coeff_matrice_morse(face, i, ncomp, 1, aii, *matrice); + + if ((elem_(face, 1)) > -1) + for (int i = 0; i < ncomp; i++) + fill_coeff_matrice_morse(face, i, ncomp, 1, ajj, *matrice); + } + } } } @@ -562,109 +731,146 @@ void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_sortie_libre_(const int num_c * ====== FA7 ELEM ===== * ===================== */ template template -void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_elem(const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const +void 
Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_elem(const int ncomp, matrices_t mats, DoubleTab& tab_secmem, const tabs_t& semi_impl) const { - DoubleTab& tab_flux_bords = op_base->flux_bords(); - const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - Type_Double flux(ncomp), aii(ncomp), ajj(ncomp); + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); const int n_fc_bd = le_dom->nb_faces_bord(); - const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : - &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); -// const IntTab& f_e = le_dom->face_voisins(); + const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") : + &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs(); + + int dim = dimension; + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + CIntTabView elem_faces = elem_faces_.view_ro(); + CIntArrView orientation = orientation_.view_ro(); + CDoubleTabView inco = tab_inco.view_ro(); + CDoubleTabView a_r = tab_a_r != nullptr ? tab_a_r->view_ro() : ConstView(); + DoubleTabView secmem = tab_secmem.view_rw(); + DoubleTabView flux_bords = op_base->flux_bords().view_rw(); // XXX : flux_bords rempli seulement si MIXTE ... ie pas INTERNE ! 
// second membre - for (int num_elem = 0; num_elem < nb_elem; num_elem++) - for (int fa7 = 0; fa7 < dimension; fa7++) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({0, 0}, {nb_elem, ncomp}), + KOKKOS_LAMBDA(const int num_elem, const int k) + { + for (int fa7 = 0; fa7 < dim; fa7++) { - int fac1 = elem_faces(num_elem, fa7), fac2 = elem_faces(num_elem, fa7 + dimension); - flux_evaluateur.template flux_fa7 < Type_Flux_Fa7::ELEM > (inco, a_r, num_elem, fac1, fac2, flux); - - fill_resu_tab < Type_Double > (fac1, fac2, ncomp, flux, secmem); - - if (fac1 < n_fc_bd) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac1, orientation(fac1)) += flux[k]; - - if (fac2 < n_fc_bd) - for (int k = 0; k < ncomp; k++) tab_flux_bords(fac2, orientation(fac2)) -= flux[k]; + const int fac1 = elem_faces(num_elem, fa7), fac2 = elem_faces(num_elem, fa7 + dim); + double flux; + flux_evaluateur.template flux_fa7_comp(inco, a_r, num_elem, fac1, fac2, k, flux); + Kokkos::atomic_add(&secmem(fac1, k), +flux); + Kokkos::atomic_add(&secmem(fac2, k), -flux); + if (fac1 < n_fc_bd) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac1)), +flux); + if (fac2 < n_fc_bd) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac2)), -flux); } + }); + end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? 
mats.at(nom_ch_inco_) : nullptr); if (matrice) { + Type_Double aii(ncomp), ajj(ncomp); + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < nb_elem; num_elem++) for (int fa7 = 0; fa7 < dimension; fa7++) { - const int fac1 = elem_faces(num_elem, fa7), fac2 = elem_faces(num_elem, fa7 + dimension); - flux_evaluateur.template coeffs_fa7 < Type_Flux_Fa7::ELEM > (a_r, num_elem, fac1, fac2, aii, ajj); + const int fac1 = elem_faces_(num_elem, fa7), fac2 = elem_faces_(num_elem, fa7 + dimension); + flux_evaluateur.template coeffs_fa7 < Type_Flux_Fa7::ELEM > (tab_a_r, num_elem, fac1, fac2, aii, ajj); for (int i = 0; i < ncomp; i++) fill_coeff_matrice_morse < Type_Double > (fac1, fac2, i, ncomp, aii, ajj, *matrice); } } // On corrige si cl periodique ... - corriger_fa7_elem_periodicite(ncomp, mats, secmem, semi_impl); + corriger_fa7_elem_periodicite(ncomp, mats, tab_secmem, semi_impl); } template template -void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite(const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const +void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite(const int ncomp, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const { - Type_Double flux(ncomp), aii(ncomp), ajj(ncomp); - const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(); - Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr); + const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) + : le_champ_convecte_ou_inc->valeurs(); + Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) + ? mats.at(nom_ch_inco_) : nullptr) : (mats.count( + nom_ch_inco_) ? 
mats.at(nom_ch_inco_) : nullptr); for (int num_cl = 0; num_cl < le_dom->nb_front_Cl(); num_cl++) - { + { const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl); if (sub_type(Periodique, la_cl.valeur())) { const Periodique& la_cl_perio = ref_cast(Periodique, la_cl.valeur()); const Front_VF& le_bord = ref_cast(Front_VF, la_cl_perio.frontiere_dis()); - int num_elem, signe, fac1, fac2, ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); + int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); // second membre - for (int face = ndeb; face < nfin; face++) - { - flux = 0.; - corriger_fa7_elem_periodicite__(face, num_elem, signe, fac1, fac2); - - flux_evaluateur.template flux_fa7 < Type_Flux_Fa7::ELEM > (inco, nullptr, num_elem, fac1, fac2, flux); - for (int k = 0; k < ncomp; k++) secmem(face, k) += signe * flux[k]; - } + // Copy of flux_evaluateur_ is mandatory here cause it's an attribute of the class + _TYPE_ flux_evaluateur = flux_evaluateur_; + flux_evaluateur.view_ro(); + int dim = Objet_U::dimension; + CIntTabView elem = elem_.view_ro(); + CIntTabView elem_faces = elem_faces_.view_ro(); + CIntArrView orientation = orientation_.view_ro(); + CDoubleTabView a_r; + CDoubleTabView inco = tab_inco.view_ro(); + DoubleTabView resu = tab_resu.view_rw(); + // second membre + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), + Kokkos::MDRangePolicy>({ndeb, 0}, {nfin, ncomp}), + KOKKOS_LAMBDA(const int face, const int k) + { + const int elem1 = elem(face, 0), elem2 = elem(face, 1), ori = orientation(face); + int num_elem, signe; + if ((face == elem_faces(elem1, ori)) || (face == elem_faces(elem1, ori + dim))) + { num_elem = elem2; signe = 1; } + else + { num_elem = elem1; signe = -1; } + const int fac1 = elem_faces(num_elem, ori), fac2 = elem_faces(num_elem, ori + dim); + double flux; + flux_evaluateur.template flux_fa7_comp(inco, a_r, num_elem, fac1, fac2, k, flux); + Kokkos::atomic_add(&resu(face, k), +signe * flux); + }); + 
end_gpu_timer(__KERNEL_NAME__); // derivees : champ convecte if (matrice) - for (int face = ndeb; face < nfin; face++) - { - aii = 0., ajj = 0.; - corriger_fa7_elem_periodicite__(face, num_elem, signe, fac1, fac2); - - flux_evaluateur.template coeffs_fa7 < Type_Flux_Fa7::ELEM > (nullptr, num_elem, fac1, fac2, aii, ajj); - const auto& tab1 = (*matrice).get_set_tab1(); - const auto& tab2 = (*matrice).get_set_tab2(); - auto& coeff = (*matrice).get_set_coeff(); - if (signe > 0) /* on a oublie a droite la contribution de la gauche */ - { - for (int i = 0; i < ncomp; i++) - for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++) - if (tab2[k] - 1 == face * ncomp + i) coeff[k] += aii[i]; - - for (int i = 0; i < ncomp; i++) - for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++) - if (tab2[k] - 1 == fac2 * ncomp + i) coeff[k] -= ajj[i]; - } - else /* on a oublie a gauche la contribution de la droite */ - { - for (int i = 0; i < ncomp; i++) - for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++) - if (tab2[k] - 1 == fac1 * ncomp + i) coeff[k] -= aii[i]; - - for (int i = 0; i < ncomp; i++) - for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++) - if (tab2[k] - 1 == face * ncomp + i) coeff[k] += ajj[i]; - } - } + { + Type_Double aii(ncomp), ajj(ncomp); + int num_elem, signe, fac1, fac2; + for (int face = ndeb; face < nfin; face++) + { + aii = 0., ajj = 0.; + corriger_fa7_elem_periodicite__(face, num_elem, signe, fac1, fac2); + + flux_evaluateur.template coeffs_fa7(nullptr, num_elem, fac1, fac2, aii, ajj); + const auto& tab1 = (*matrice).get_set_tab1(); + const auto& tab2 = (*matrice).get_set_tab2(); + auto& coeff = (*matrice).get_set_coeff(); + if (signe > 0) /* on a oublie a droite la contribution de la gauche */ + { + for (int i = 0; i < ncomp; i++) + for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++) + if (tab2[k] - 1 == face * 
ncomp + i) coeff[k] += aii[i]; + + for (int i = 0; i < ncomp; i++) + for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++) + if (tab2[k] - 1 == fac2 * ncomp + i) coeff[k] -= ajj[i]; + } + else /* on a oublie a gauche la contribution de la droite */ + { + for (int i = 0; i < ncomp; i++) + for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++) + if (tab2[k] - 1 == fac1 * ncomp + i) coeff[k] -= aii[i]; + + for (int i = 0; i < ncomp; i++) + for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++) + if (tab2[k] - 1 == face * ncomp + i) coeff[k] += ajj[i]; + } + } + } } } } @@ -672,8 +878,8 @@ void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite(const int ncomp, template void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite__(const int face, int& num_elem, int& signe, int& fac1, int& fac2) const { - const int elem1 = elem(face, 0), elem2 = elem(face, 1), ori = orientation(face); - if ((face == elem_faces(elem1, ori)) || (face == elem_faces(elem1, ori + dimension))) + const int elem1 = elem_(face, 0), elem2 = elem_(face, 1), ori = orientation_(face); + if ((face == elem_faces_(elem1, ori)) || (face == elem_faces_(elem1, ori + dimension))) { num_elem = elem2; signe = 1; @@ -683,7 +889,7 @@ void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite__(const int face, num_elem = elem1; signe = -1; } - fac1 = elem_faces(num_elem, ori), fac2 = elem_faces(num_elem, ori + dimension); + fac1 = elem_faces_(num_elem, ori), fac2 = elem_faces_(num_elem, ori + dimension); } /* ========================= * @@ -721,16 +927,6 @@ void Iterateur_VDF_Face<_TYPE_>::ajouter_pour_compressible(const int ncomp, matr // =================================================================================================== -template template -inline void Iterateur_VDF_Face<_TYPE_>::fill_resu_tab(const int fac1, const int fac2, const int ncomp, const Type_Double& flux, DoubleTab& resu) const -{ - for 
(int k = 0; k < ncomp; k++) - { - resu(fac1, k) += flux[k]; - resu(fac2, k) -= flux[k]; - } -} - template template void Iterateur_VDF_Face<_TYPE_>::fill_coeff_matrice_morse(const int face, const int i, const int ncomp, const int signe, const Type_Double& A, Matrice_Morse& matrice) const { @@ -759,6 +955,7 @@ void Iterateur_VDF_Face<_TYPE_>::fill_coeff_matrice_morse(const int fac1, const } } + #include #endif /* Iterateur_VDF_Face_TPP_included */ diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face_bis.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face_bis.tpp index 8cd860ad60..00f076259a 100644 --- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face_bis.tpp +++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face_bis.tpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -18,26 +18,28 @@ template inline Iterateur_VDF_Face<_TYPE_>::Iterateur_VDF_Face(const Iterateur_VDF_Face<_TYPE_>& iter) : - Iterateur_VDF_base(iter), flux_evaluateur(iter.flux_evaluateur), nb_elem(iter.nb_elem), premiere_arete_interne(iter.premiere_arete_interne), derniere_arete_interne(iter.derniere_arete_interne), + Iterateur_VDF_base(iter), flux_evaluateur_(iter.flux_evaluateur_), nb_elem(iter.nb_elem), premiere_arete_interne(iter.premiere_arete_interne), derniere_arete_interne(iter.derniere_arete_interne), premiere_arete_mixte(iter.premiere_arete_mixte), derniere_arete_mixte(iter.derniere_arete_mixte), premiere_arete_bord(iter.premiere_arete_bord), derniere_arete_bord(iter.derniere_arete_bord), premiere_arete_coin(iter.premiere_arete_coin), derniere_arete_coin(iter.derniere_arete_coin) { - orientation.ref(iter.orientation); - Qdm.ref(iter.Qdm); - elem.ref(iter.elem); - elem_faces.ref(iter.elem_faces); + 
orientation_.ref(iter.orientation_); + Qdm_.ref(iter.Qdm_); + elem_.ref(iter.elem_); + elem_faces_.ref(iter.elem_faces_); type_arete_bord.ref(iter.type_arete_bord); type_arete_coin.ref(iter.type_arete_coin); + val_imp_face_bord_.ref(iter.val_imp_face_bord_); + coeff_frottement_face_bord_.ref(iter.coeff_frottement_face_bord_); } template void Iterateur_VDF_Face<_TYPE_>::completer_() { nb_elem = le_dom->nb_elem_tot(); - orientation.ref(le_dom->orientation()); - Qdm.ref(le_dom->Qdm()); - elem.ref(le_dom->face_voisins()); - elem_faces.ref(le_dom->elem_faces()); + orientation_.ref(le_dom->orientation()); + Qdm_.ref(le_dom->Qdm()); + elem_.ref(le_dom->face_voisins()); + elem_faces_.ref(le_dom->elem_faces()); type_arete_bord.ref(la_zcl->type_arete_bord()); type_arete_coin.ref(la_zcl->type_arete_coin()); premiere_arete_interne = le_dom->premiere_arete_interne(); @@ -62,8 +64,12 @@ inline void Iterateur_VDF_Face<_TYPE_>::multiply_by_rho_if_hydraulique(DoubleTab { const double coef = rho.valeurs()(0, 0); const int nb_faces_bord = le_dom->nb_faces_bord(); - for (int face = 0; face < nb_faces_bord; face++) - for (int k = 0; k < tab_flux_bords.line_size(); k++) tab_flux_bords(face, k) *= coef; + DoubleTabView flux_bords = tab_flux_bords.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces_bord), KOKKOS_LAMBDA(const int face) + { + for (int k = 0; k < (int)flux_bords.extent(1); k++) flux_bords(face, k) *= coef; + }); + end_gpu_timer(__KERNEL_NAME__); } } } @@ -77,45 +83,51 @@ int Iterateur_VDF_Face<_TYPE_>::impr(Sortie& os) const const Schema_Temps_base& sch = la_zcl->equation().probleme().schema_temps(); DoubleTab& tab_flux_bords = op_base->flux_bords(); DoubleVect bilan(tab_flux_bords.dimension(1)); - DoubleTab xgr; - if (impr_mom) xgr = le_dom->calculer_xgr(); - int k, face, nb_front_Cl = le_dom->nb_front_Cl(); - DoubleTrav flux_bords2(5, nb_front_Cl, tab_flux_bords.dimension(1)); - flux_bords2 = 0; + DoubleTab tab_xgr; + if (impr_mom) 
tab_xgr = le_dom->calculer_xgr(); + int nb_front_Cl = le_dom->nb_front_Cl(); + DoubleTrav tab_flux_bords2(5, nb_front_Cl, tab_flux_bords.dimension(1)); + tab_flux_bords2 = 0; + const int dim = Objet_U::dimension; + const int ncomp = tab_flux_bords.dimension(1); + CDoubleTabView flux_bords = tab_flux_bords.view_ro(); + DoubleTabView3 flux_bords2 = tab_flux_bords2.view_rw<3>(); + CDoubleTabView xgr; + if (impr_mom) xgr = tab_xgr.view_ro(); for (int num_cl = 0; num_cl < nb_front_Cl; num_cl++) { const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl); const Front_VF& frontiere_dis = ref_cast(Front_VF, la_cl->frontiere_dis()); - int ndeb = frontiere_dis.num_premiere_face(), nfin = ndeb + frontiere_dis.nb_faces(), periodicite = (type_cl(la_cl) == periodique ? 1 : 0); - for (face = ndeb; face < nfin; face++) - { - for (k = 0; k < tab_flux_bords.dimension(1); k++) - { - flux_bords2(0, num_cl, k) += tab_flux_bords(face, k); - if (periodicite) - { - if (face < (ndeb + frontiere_dis.nb_faces() / 2)) - flux_bords2(1, num_cl, k) += tab_flux_bords(face, k); - else - flux_bords2(2, num_cl, k) += tab_flux_bords(face, k); - } - if (mon_dom.bords_a_imprimer_sum().contient(frontiere_dis.le_nom())) - flux_bords2(3, num_cl, k) += tab_flux_bords(face, k); - } /* fin for k */ - if (impr_mom) - { - if (dimension == 2) - flux_bords2(4, num_cl, 0) += tab_flux_bords(face, 1) * xgr(face, 0) - tab_flux_bords(face, 0) * xgr(face, 1); - else - { - flux_bords2(4, num_cl, 0) += tab_flux_bords(face, 2) * xgr(face, 1) - tab_flux_bords(face, 1) * xgr(face, 2); - flux_bords2(4, num_cl, 1) += tab_flux_bords(face, 0) * xgr(face, 2) - tab_flux_bords(face, 2) * xgr(face, 0); - flux_bords2(4, num_cl, 2) += tab_flux_bords(face, 1) * xgr(face, 0) - tab_flux_bords(face, 0) * xgr(face, 1); - } - } - } /* fin for face */ + const int ndeb = frontiere_dis.num_premiere_face(), nfin = ndeb + frontiere_dis.nb_faces(), periodicite = (type_cl(la_cl) == periodique ? 
1 : 0); + const int first_half_end = ndeb + frontiere_dis.nb_faces() / 2; + const int impr_boundary = mon_dom.bords_a_imprimer_sum().contient(frontiere_dis.le_nom()) ? 1 : 0; + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int face) + { + for (int k = 0; k < ncomp; k++) + { + Kokkos::atomic_add(&flux_bords2(0, num_cl, k), flux_bords(face, k)); + if (periodicite) + { + if (face < first_half_end) Kokkos::atomic_add(&flux_bords2(1, num_cl, k), flux_bords(face, k)); + else Kokkos::atomic_add(&flux_bords2(2, num_cl, k), flux_bords(face, k)); + } + if (impr_boundary) Kokkos::atomic_add(&flux_bords2(3, num_cl, k), flux_bords(face, k)); + } + if (impr_mom) + { + if (dim == 2) + Kokkos::atomic_add(&flux_bords2(4, num_cl, 0), flux_bords(face, 1) * xgr(face, 0) - flux_bords(face, 0) * xgr(face, 1)); + else + { + Kokkos::atomic_add(&flux_bords2(4, num_cl, 0), flux_bords(face, 2) * xgr(face, 1) - flux_bords(face, 1) * xgr(face, 2)); + Kokkos::atomic_add(&flux_bords2(4, num_cl, 1), flux_bords(face, 0) * xgr(face, 2) - flux_bords(face, 2) * xgr(face, 0)); + Kokkos::atomic_add(&flux_bords2(4, num_cl, 2), flux_bords(face, 1) * xgr(face, 0) - flux_bords(face, 0) * xgr(face, 1)); + } + } + }); + end_gpu_timer(__KERNEL_NAME__); } - mp_sum_for_each_item(flux_bords2); + mp_sum_for_each_item(tab_flux_bords2); if (je_suis_maitre()) { op_base->ouvrir_fichier(Flux, "", 1); @@ -130,30 +142,30 @@ int Iterateur_VDF_Face<_TYPE_>::impr(Sortie& os) const { const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl); int periodicite = (type_cl(la_cl) == periodique ? 
1 : 0); - for (k = 0; k < tab_flux_bords.dimension(1); k++) + for (int k = 0; k < tab_flux_bords.dimension(1); k++) { if (periodicite) { - Flux.add_col(flux_bords2(1, num_cl, k)); - Flux.add_col(flux_bords2(2, num_cl, k)); + Flux.add_col(tab_flux_bords2(1, num_cl, k)); + Flux.add_col(tab_flux_bords2(2, num_cl, k)); } else - Flux.add_col(flux_bords2(0, num_cl, k)); + Flux.add_col(tab_flux_bords2(0, num_cl, k)); if (impr_sum) - Flux_sum.add_col(flux_bords2(3, num_cl, k)); - bilan(k) += flux_bords2(0, num_cl, k); + Flux_sum.add_col(tab_flux_bords2(3, num_cl, k)); + bilan(k) += tab_flux_bords2(0, num_cl, k); } if (dimension == 3) { - for (k = 0; k < tab_flux_bords.dimension(1); k++) + for (int k = 0; k < tab_flux_bords.dimension(1); k++) if (impr_mom) - Flux_moment.add_col(flux_bords2(4, num_cl, k)); + Flux_moment.add_col(tab_flux_bords2(4, num_cl, k)); } else if (impr_mom) - Flux_moment.add_col(flux_bords2(4, num_cl, 0)); + Flux_moment.add_col(tab_flux_bords2(4, num_cl, 0)); } /* fin for num_cl */ - for (k = 0; k < tab_flux_bords.dimension(1); k++) + for (int k = 0; k < tab_flux_bords.dimension(1); k++) Flux.add_col(bilan(k)); Flux << finl; if (impr_sum) @@ -181,13 +193,13 @@ int Iterateur_VDF_Face<_TYPE_>::impr(Sortie& os) const sch.imprimer_temps_courant(Flux_face); Flux_face << " : " << finl; } - for (face = ndeb; face < nfin; face++) + for (int face = ndeb; face < nfin; face++) { if (dimension == 2) Flux_face << "# Face a x= " << le_dom->xv(face, 0) << " y= " << le_dom->xv(face, 1) << " : "; else if (dimension == 3) Flux_face << "# Face a x= " << le_dom->xv(face, 0) << " y= " << le_dom->xv(face, 1) << " z= " << le_dom->xv(face, 2) << " : "; - for (k = 0; k < tab_flux_bords.dimension(1); k++) + for (int k = 0; k < tab_flux_bords.dimension(1); k++) Flux_face << tab_flux_bords(face, k) << " "; Flux_face << finl; } diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_base.h b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_base.h index 2b04e9208a..879dbd14ca 100644 
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_base.h +++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_base.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -97,7 +97,9 @@ class Iterateur_VDF_base : public Objet_U virtual DoubleTab& ajouter(const DoubleTab& inco, DoubleTab& secmem) const final { - ajouter_blocs({}, secmem, {{ op_base->equation().inconnue().le_nom().getString(), inco }}); + tabs_t semi_impl; + semi_impl[op_base->equation().inconnue().le_nom().getString()].ref(inco); /* evite la copie de inco dans tabs_t */ + ajouter_blocs({}, secmem, semi_impl); return secmem; } diff --git a/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.cpp b/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.cpp index 5aa0396145..6833bf5650 100644 --- a/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.cpp +++ b/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.cpp @@ -45,12 +45,21 @@ Entree& Op_Conv_VDF_base::readOn(Entree& s) return s; } +// ToDo try to templatize: inline void eval_fluent(const double psc, const int num1, const int num2, const int n, DoubleTab& fluent) { if (psc >= 0) fluent(num2, n) += psc; else fluent(num1, n) -= psc; } +KOKKOS_INLINE_FUNCTION void eval_fluent(const double psc, const int num1, const int num2, const int n, DoubleTabView fluent) +{ + if (psc >= 0) + Kokkos::atomic_add(&fluent(num2, n), +psc); + else + Kokkos::atomic_add(&fluent(num1, n), -psc); +} + void Op_Conv_VDF_base::completer() { Operateur_base::completer(); @@ -110,18 +119,23 @@ void Op_Conv_VDF_base::dimensionner_blocs_elem(matrices_t mats, const tabs_t& se { const IntTab& fcl_v = ref_cast(Champ_Face_VDF, vitesse()).fcl(); + ToDo_Kokkos("critical"); for (f = 0; f < domaine.nb_faces_tot(); f++) if (fcl_v(f, 0) < 2) for (i = 0; i < 2; 
i++) if ((e = f_e(f, i)) >= 0 && e < domaine.nb_elem_tot()) for (n = 0; n < N; n++) stencil.append_line(N * e + n, M * f + n * (M > 1)); } - else for (f = 0; f < domaine.nb_faces_tot(); f++) - for (i = 0; i < 2; i++) - if ((e = f_e(f, i)) >= 0 && e < domaine.nb_elem_tot()) /* inconnues scalaires */ - for (j = 0; j < 2; j++) - if ((eb = f_e(f, j)) >= 0) - for (n = 0, m = 0; n < N; n++, m += (M > 1)) stencil.append_line(N * e + n, M * eb + m); + else + { + ToDo_Kokkos("critical"); + for (f = 0; f < domaine.nb_faces_tot(); f++) + for (i = 0; i < 2; i++) + if ((e = f_e(f, i)) >= 0 && e < domaine.nb_elem_tot()) /* inconnues scalaires */ + for (j = 0; j < 2; j++) + if ((eb = f_e(f, j)) >= 0) + for (n = 0, m = 0; n < N; n++, m += (M > 1)) stencil.append_line(N * e + n, M * eb + m); + } tableau_trier_retirer_doublons(stencil); const int nl = equation().inconnue().valeurs().size_totale(), @@ -151,6 +165,7 @@ void Op_Conv_VDF_base::dimensionner_blocs_face(matrices_t matrices, const tabs_t /* agit uniquement aux elements; diagonale omise */ + ToDo_Kokkos("critical"); for (int f = 0; f < domaine.nb_faces_tot(); f++) if (f_e(f, 0) >= 0 && (f_e(f, 1) >= 0 || fcl(f, 0) == 3)) for (int i = 0; i < 2; i++) @@ -179,13 +194,9 @@ double Op_Conv_VDF_base::calculer_dt_stab() const { const Domaine_VDF& domaine_VDF = iter_->domaine(); const Domaine_Cl_VDF& domaine_Cl_VDF = iter_->domaine_Cl(); - const IntTab& face_voisins = domaine_VDF.face_voisins(); - const DoubleVect& volumes = domaine_VDF.volumes(); - const DoubleVect& face_surfaces = domaine_VDF.face_surfaces(); const DoubleTab& vit_associe = vitesse().valeurs(); - const DoubleTab& vit= (vitesse_pour_pas_de_temps_?vitesse_pour_pas_de_temps_->valeurs(): vit_associe); - const int N = std::min(vit.line_size(), equation().inconnue().valeurs().line_size()); - const DoubleTab* alp = sub_type(Pb_Multiphase, equation().probleme()) ? 
&ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr; + const DoubleTab& tab_vit = (vitesse_pour_pas_de_temps_?vitesse_pour_pas_de_temps_->valeurs(): vit_associe); + const int N = std::min(tab_vit.line_size(), equation().inconnue().valeurs().line_size()); if (!fluent_.get_md_vector()) { fluent_.resize(0, N); @@ -193,8 +204,6 @@ double Op_Conv_VDF_base::calculer_dt_stab() const } fluent_ = 0; // Remplissage du tableau fluent - double psc; - int num1, num2, face, elem1; // On traite les bords for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++) @@ -203,46 +212,72 @@ double Op_Conv_VDF_base::calculer_dt_stab() const if ( sub_type(Dirichlet_entree_fluide,la_cl.valeur()) || sub_type(Neumann_sortie_libre,la_cl.valeur()) ) { const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis()); - num1 = le_bord.num_premiere_face(); - num2 = num1 + le_bord.nb_faces(); - for (face=num1; face 0) fluent_(face_voisins(face,1), n) += psc; + if (psc > 0) + Kokkos::atomic_add(&fluent(face_voisins(face,1), n), +psc); } + }); + end_gpu_timer(__KERNEL_NAME__); } } // Boucle sur les faces internes pour remplir fluent const int domaine_VDF_nb_faces = domaine_VDF.nb_faces(), premiere_face = domaine_VDF.premiere_face_int(); - for (face = premiere_face; face < domaine_VDF_nb_faces; face++) + CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro(); + CDoubleArrView face_surfaces = domaine_VDF.face_surfaces().view_ro(); + CDoubleTabView vit = tab_vit.view_ro(); + DoubleTabView fluent = fluent_.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(premiere_face, domaine_VDF_nb_faces), KOKKOS_LAMBDA(const int face) + { for (int n = 0; n < N; n++) { - psc = vit(face, n) * face_surfaces(face); - eval_fluent(psc, face_voisins(face, 0), face_voisins(face, 1), n, fluent_); + const double psc = vit(face, n) * face_surfaces(face); + eval_fluent(psc, face_voisins(face, 0), face_voisins(face, 1), n, fluent); } + }); + 
end_gpu_timer(__KERNEL_NAME__); // Calcul du pas de temps de stabilite a partir du tableau fluent if (vitesse().le_nom()=="rho_u" && equation().probleme().is_dilatable()) diviser_par_rho_si_dilatable(fluent_,equation().milieu()); const double alpha_min_dt = 1e-3; // avoid stupid time steps during vanishing phase - double dt_stab = 1.e30; + double dt_stab; int domaine_VDF_nb_elem=domaine_VDF.nb_elem(); + bool is_pbm = sub_type(Pb_Multiphase, equation().probleme()); + const DoubleTab* ptr_alpha = is_pbm ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : &fluent_ /* not used */; // dt_stab = min ( 1 / ( |U|/dx + |V|/dy + |W|/dz ) ) - for (int num_poly=0; num_polyview_ro(); + CDoubleArrView volumes = domaine_VDF.volumes().view_ro(); + fluent_.view_ro(); + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), domaine_VDF_nb_elem, KOKKOS_LAMBDA(const int num_poly, double& dt_stab_) + { for (int n = 0; n < N; n++) - if ((!alp || (*alp)(num_poly, n) > alpha_min_dt)) + if ((!is_pbm || alpha(num_poly, n) > alpha_min_dt)) { - double dt_elem = volumes(num_poly)/(fluent_(num_poly, n)+DMINFLOAT); - if (dt_elem(dt_stab)); + end_gpu_timer(__KERNEL_NAME__); + dt_stab = std::min(1.e30, dt_stab); // Kokkos initialize to std::numeric_limits::max() which is not what we wanted (1e30). Cause a division per 0 later... 
dt_stab = Process::mp_min(dt_stab); @@ -284,6 +319,7 @@ void Op_Conv_VDF_base::calculer_dt_local(DoubleTab& dt_face) const // Boucle sur les faces internes pour remplir fluent const int domaine_VDF_nb_faces = domaine_VDF.nb_faces(), premiere_face = domaine_VDF.premiere_face_int(); + ToDo_Kokkos("critical"); for (int face = premiere_face; face < domaine_VDF_nb_faces; face++) { const double value = vit[face]*face_surfaces(face); @@ -303,6 +339,7 @@ void Op_Conv_VDF_base::calculer_dt_local(DoubleTab& dt_face) const const Cond_lim& la_cl = domaine_Cl_VDF.les_conditions_limites(n_bord); const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis()); const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { if( sup_strict(fluent[num_face], 1.e-16) ) dt_face(num_face)= volumes_entrelaces(num_face)/fluent[num_face]; @@ -311,6 +348,7 @@ void Op_Conv_VDF_base::calculer_dt_local(DoubleTab& dt_face) const } // Boucle sur les faces internes + ToDo_Kokkos("critical"); for (int num_face = premiere_face; num_facefrontiere_dis()); const int nb_faces_bord = le_bord.nb_faces(); + ToDo_Kokkos("critical"); for (int ind_face = 0; ind_face < nb_faces_bord; ind_face++) { int ind_face_associee = la_cl_perio.face_associee(ind_face); @@ -379,6 +418,7 @@ void Op_Conv_VDF_base::calculer_pour_post(Champ_base& espace_stockage,const Nom& const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis()); num1 = le_bord.num_premiere_face(); num2 = num1 + le_bord.nb_faces(); + ToDo_Kokkos("critical"); for (face = num1; face < num2; face++) { psc = vit[face]*face_surfaces(face); @@ -394,6 +434,7 @@ void Op_Conv_VDF_base::calculer_pour_post(Champ_base& espace_stockage,const Nom& // Boucle sur les faces internes pour remplir fluent const int domaine_VDF_nb_faces = domaine_VDF.nb_faces(); + ToDo_Kokkos("critical"); for (face = domaine_VDF.premiere_face_int(); face < domaine_VDF_nb_faces; 
face++) { psc = vit[face]*face_surfaces(face); @@ -490,6 +531,7 @@ void Op_Conv_VDF_base::mettre_a_jour(double temps) { Champ_Face_VDF& c_ph = ref_cast(Champ_Face_VDF, cc_phases_[n].valeur()); DoubleTab& v_ph = c_ph.valeurs(); + ToDo_Kokkos("critical"); for (f = 0; f < domaine.nb_faces(); v_ph(f) *= vit(f, m) * pf(f), f++) for (v_ph(f) = 0, i = 0; i < 2; i++) v_ph(f) += (1. + (vit(f, m) * (i ? -1 : 1) >= 0 ? 1. : -1.) * 1.0 /* FIXME : amont */) / 2 * ((e = f_e(f, i)) >= 0 ? vcc(e, n) : bcc(f, n)); c_ph.changer_temps(temps); @@ -503,6 +545,7 @@ void Op_Conv_VDF_base::mettre_a_jour(double temps) Champ_Face_VDF& c_ph = ref_cast(Champ_Face_VDF, vd_phases_[n].valeur()); DoubleTab& v_ph = c_ph.valeurs(); /* on remplit la partie aux faces, puis on demande au champ d'interpoler aux elements */ + ToDo_Kokkos("critical"); for (f = 0; f < domaine.nb_faces(); v_ph(f) *= vit(f, m) * pf(f), f++) for (v_ph(f) = 0, i = 0; i < 2; i++) v_ph(f) += (1. + (vit(f, m) * (i ? -1 : 1) >= 0 ? 1. : -1.) * 1.0 /* FIXME : amont */) / 2 * ((e = f_e(f, i)) >= 0 ? 
alp(e, n) : balp(f, n)); c_ph.changer_temps(temps); @@ -510,6 +553,7 @@ void Op_Conv_VDF_base::mettre_a_jour(double temps) DoubleTrav G(N), v(N, D); double Gt; + ToDo_Kokkos("critical"); if (x_phases_.size()) for (e = 0; e < domaine.nb_elem(); e++) //titre : aux elements { diff --git a/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.h b/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.h index 8cd850dcff..0bed89b59a 100644 --- a/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.h +++ b/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.h @@ -89,5 +89,6 @@ class Op_Conv_VDF_base : public Operateur_Conv_base // Fonction utile pour le calcul du pas de temps de stabilite inline void eval_fluent(const double , const int , const int , const int , DoubleTab& ); +KOKKOS_INLINE_FUNCTION void eval_fluent(const double , const int , const int , const int , DoubleTabView ); #endif /* Op_Conv_VDF_base_included */ diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_Face_Axi_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_Face_Axi_base.cpp index de08db801e..5862899aeb 100644 --- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_Face_Axi_base.cpp +++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_Face_Axi_base.cpp @@ -48,6 +48,7 @@ double Op_Diff_VDF_Face_Axi_base::calculer_dt_stab() const void Op_Diff_VDF_Face_Axi_base::ajouter_elem(const DoubleTab& inco, DoubleTab& resu) const { if (inco.line_size() > 1) not_implemented(__func__); + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++) { const int fx0 = elem_faces(num_elem,0), fx1 = elem_faces(num_elem,dimension), fy0 = elem_faces(num_elem,1), fy1 = elem_faces(num_elem,1+dimension); @@ -74,6 +75,7 @@ void Op_Diff_VDF_Face_Axi_base::ajouter_elem(const DoubleTab& inco, DoubleTab& r void Op_Diff_VDF_Face_Axi_base::ajouter_elem_3D(const DoubleTab& inco, DoubleTab& resu) const { + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < 
le_dom_vdf->nb_elem(); num_elem++) { const int fz0 = elem_faces(num_elem,2), fz1 = elem_faces(num_elem,2+dimension); @@ -98,15 +100,14 @@ void Op_Diff_VDF_Face_Axi_base::ajouter_aretes_bords(const DoubleTab& inco, Dou case TypeAreteBordVDF::PAROI_FLUIDE: { const int fac1 = Qdm(n_arete,0), fac2 = Qdm(n_arete,1), fac3 = Qdm(n_arete,2), signe = Qdm(n_arete,3), ori1 = orientation(fac1), ori3 = orientation(fac3); - const int rang1 = fac1 - le_dom_vdf->premiere_face_bord(), rang2 = fac2 - le_dom_vdf->premiere_face_bord(); double vit_imp, dist3, tps = inconnue->temps(); if (n_type == TypeAreteBordVDF::PAROI_FLUIDE) // arete paroi_fluide :il faut determiner qui est la face fluide { - if (est_egal(inco[fac1],0)) vit_imp = Champ_Face_get_val_imp_face_bord(tps,rang2,ori3,la_zcl_vdf.valeur()); - else vit_imp = Champ_Face_get_val_imp_face_bord(tps,rang1,ori3,la_zcl_vdf.valeur()); + if (est_egal(inco[fac1],0)) vit_imp = Champ_Face_get_val_imp_face_bord(tps,fac2,ori3,la_zcl_vdf.valeur()); + else vit_imp = Champ_Face_get_val_imp_face_bord(tps,fac1,ori3,la_zcl_vdf.valeur()); } - else vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord(tps,rang1,ori3,la_zcl_vdf.valeur())+Champ_Face_get_val_imp_face_bord(tps,rang2,ori3,la_zcl_vdf.valeur())); + else vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord(tps,fac1,ori3,la_zcl_vdf.valeur())+Champ_Face_get_val_imp_face_bord(tps,fac2,ori3,la_zcl_vdf.valeur())); const double db_diffusivite = nu_mean_2_pts_(face_voisins(fac3,0),face_voisins(fac3,1)); @@ -313,6 +314,7 @@ void Op_Diff_VDF_Face_Axi_base::ajouter_contribution_elem(const DoubleTab& inco, const auto& tab1 = matrice.get_set_tab1(); const auto& tab2 = matrice.get_set_tab2(); auto& coeff = matrice.get_set_coeff(); + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++) { const int fx0 = elem_faces(num_elem,0), fx1 = elem_faces(num_elem,dimension), fy0 = elem_faces(num_elem,1), fy1 = elem_faces(num_elem,1+dimension); @@ -345,6 +347,7 @@ void 
Op_Diff_VDF_Face_Axi_base::ajouter_contribution_elem(const DoubleTab& inco, void Op_Diff_VDF_Face_Axi_base::ajouter_contribution_elem_3D(Matrice_Morse& matrice) const { + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++) { const int fz0 = elem_faces(num_elem,2), fz1 = elem_faces(num_elem,2+dimension); @@ -572,17 +575,17 @@ void Op_Diff_VDF_Face_Axi_base::contribue_au_second_membre(DoubleTab& resu) cons case TypeAreteBordVDF::PAROI_FLUIDE: { const int fac1 = Qdm(n_arete,0), fac2 = Qdm(n_arete,1), fac3 = Qdm(n_arete,2), signe = Qdm(n_arete,3); - const int ori1 = orientation(fac1), ori3 = orientation(fac3), rang1 = fac1 - le_dom_vdf->premiere_face_bord(), rang2 = fac2 - le_dom_vdf->premiere_face_bord(); + const int ori1 = orientation(fac1), ori3 = orientation(fac3); double vit_imp, tps = inconnue->temps(); if (n_type == TypeAreteBordVDF::PAROI_FLUIDE) // arete paroi_fluide :il faut determiner qui est la face fluide { if (est_egal(inconnue->valeurs()(fac1), 0)) - vit_imp = Champ_Face_get_val_imp_face_bord(tps, rang2, ori3, la_zcl_vdf.valeur()); + vit_imp = Champ_Face_get_val_imp_face_bord(tps, fac2, ori3, la_zcl_vdf.valeur()); else - vit_imp = Champ_Face_get_val_imp_face_bord(tps, rang1, ori3, la_zcl_vdf.valeur()); + vit_imp = Champ_Face_get_val_imp_face_bord(tps, fac1, ori3, la_zcl_vdf.valeur()); } - else vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord(tps,rang1,ori3,la_zcl_vdf.valeur())+Champ_Face_get_val_imp_face_bord(tps,rang2,ori3,la_zcl_vdf.valeur())); + else vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord(tps,fac1,ori3,la_zcl_vdf.valeur())+Champ_Face_get_val_imp_face_bord(tps,fac2,ori3,la_zcl_vdf.valeur())); const double db_diffusivite = nu_mean_2_pts_(face_voisins(fac3,0),face_voisins(fac3,1)); diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.cpp index a01d47ed95..bfde8592b3 100644 --- 
a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.cpp +++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.cpp @@ -122,6 +122,7 @@ void Op_Diff_VDF_base::ajoute_terme_pour_axi(matrices_t matrices, DoubleTab& sec if (tab_diffusivite.size() == 1) diffu_tot = tab_diffusivite(0, 0); else diffu_tot = tab_diffusivite; + ToDo_Kokkos("critical"); for (face = 0; face < nb_faces; face++) for (int n = 0; n < N; n++) if (ori(face) == 0) @@ -173,39 +174,49 @@ double Op_Diff_VDF_base::calculer_dt_stab_(const Domaine_VDF& zone_VDF) const // initial (comme en thermique) et non le Max sur les volumes de Qdm. double dt_stab = DMAXFLOAT; const Champ_base& ch_diffu = has_champ_masse_volumique() ? diffusivite() : diffusivite_pour_pas_de_temps(); - const DoubleTab& diffu = ch_diffu.valeurs(), *alp = sub_type(Pb_Multiphase, equation().probleme()) ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr; + const DoubleTab& tab_diffu = ch_diffu.valeurs(), *tab_alp = sub_type(Pb_Multiphase, equation().probleme()) ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr; const bool Cdiffu = sub_type(Champ_Uniforme, ch_diffu); + const int nb_elem = zone_VDF.nb_elem(); + const int nb_comp = tab_diffu.dimension(1); + const bool has_rho = has_champ_masse_volumique(); // Si la diffusivite est variable, ce doit etre un champ aux elements. 
- assert(Cdiffu || diffu.size() == diffu.line_size() * zone_VDF.nb_elem()); + assert(Cdiffu || tab_diffu.size() == tab_diffu.line_size() * nb_elem); int rho_comme_diff = 0; - if (has_champ_masse_volumique()) + int dim = Objet_U::dimension; + Domaine_VDF_View dom_VDF(zone_VDF); // Use a view on Domaine_VDF to use dim_elem() method + CDoubleTabView rho; + if (has_rho) { - const DoubleTab& rho = get_champ_masse_volumique().valeurs(); - rho_comme_diff = (rho.dimension(1) == diffu.dimension(1)); - } - - for (int elem = 0; elem < zone_VDF.nb_elem(); elem++) - { - double h = 0; - for (int d = 0 ; d < dimension; d++) - { - const double l = zone_VDF.dim_elem(elem, d); - h += 1. / (l * l); - } - for (int n = 0; n < diffu.dimension(1); n++) - { - double alpha_loc = diffu(Cdiffu ? 0 : elem, n); - if (has_champ_masse_volumique()) - { - const DoubleTab& rho = get_champ_masse_volumique().valeurs(); - alpha_loc/= rho(elem, rho_comme_diff * n); - } - const double dt_loc = (alp ? (*alp)(elem, n) : 1.0) * 0.5 / ((alpha_loc + DMINFLOAT) * h); - if (dt_loc < dt_stab) dt_stab = dt_loc; - } + rho = get_champ_masse_volumique().valeurs().view_ro(); + rho_comme_diff = (get_champ_masse_volumique().valeurs().dimension(1) == tab_diffu.dimension(1)); } + CDoubleTabView alp; + if (tab_alp) alp = tab_alp->view_ro(); + CDoubleTabView diffu = tab_diffu.view_ro(); + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem, double& dtstab) + { + // Calculate mesh coefficient h = sum(1/(l*l)) for each spatial dimension + double h = 0.0; + for (int d = 0; d < dim; d++) + { + const double l = dom_VDF.dim_elem(elem, d); + h += 1.0 / (l * l); + } + // Loop over components + for (int n = 0; n < nb_comp; n++) + { + double alpha_loc = diffu(Cdiffu ? 0 : elem, n); + if (has_rho) + { + alpha_loc /= rho(elem, rho_comme_diff * n); + } + const double dt_loc = (alp.data() ? 
alp(elem, n) : 1.0) * 0.5 / ((alpha_loc + DMINFLOAT) * h); + if (dt_loc < dtstab) dtstab = dt_loc; + } + }, Kokkos::Min(dt_stab)); + end_gpu_timer(__KERNEL_NAME__); return Process::mp_min(dt_stab); } diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.h b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.h index dd7e7f96f7..d624bcdee1 100644 --- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.h +++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.h @@ -43,8 +43,10 @@ class Op_Diff_VDF_base : public Operateur_Diff_base void contribuer_au_second_membre(DoubleTab& resu) const override { iter_->contribuer_au_second_membre(resu); } void check_multiphase_compatibility() const override { } -protected: + protected_but_public_for_cuda double calculer_dt_stab_(const Domaine_VDF& zone_VDF) const; + +protected: void ajoute_terme_pour_axi(matrices_t , DoubleTab& , const tabs_t& ) const; OWN_PTR(Iterateur_VDF_base) iter_; diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.cpp index 79eb7f7a07..9e84b572f2 100644 --- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.cpp +++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -32,36 +32,46 @@ double Op_Dift_VDF_Elem_base::calculer_dt_stab_elem() const { double dt_stab, coef = -1.e10; const Domaine_VDF& domaine_VDF = iter_->domaine(); - const IntTab& elem_faces = domaine_VDF.elem_faces(); + const IntTab& tab_elem_faces = domaine_VDF.elem_faces(); // need elem_faces ro const DoubleVect& alpha_t = diffusivite_turbulente().valeurs(); bool is_concentration = (equation().que_suis_je().debute_par("Convection_Diffusion_Concentration") || equation().que_suis_je().debute_par("Convection_Diffusion_Espece")); - ArrOfInt numfa(2*dimension); - for (int elem = 0; elem < domaine_VDF.nb_elem(); elem++) - { + ArrOfInt tab_numfa(2*dimension); // An array of probably 4 or 6 ints + ToDo_Kokkos("critical"); + CIntTabView elem_faces = tab_elem_faces.view_ro(); + const int Ccp = sub_type(Champ_Uniforme, mon_equation->milieu().capacite_calorifique()); + const int Cr = sub_type(Champ_Uniforme, mon_equation->milieu().masse_volumique()); + const DoubleTab& tab_Cp = mon_equation->milieu().capacite_calorifique().valeurs(); // NEED tab_Cp ro + const DoubleTab& tab_r = mon_equation->milieu().masse_volumique().valeurs(); // NEED tab_r ro + CDoubleTabView Cp = tab_Cp.view_ro(); + CDoubleTabView r = tab_r.view_ro(); + IntArrView numfa = tab_numfa.view_rw(); + auto* self = this; + static int l_dim = dimension; + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(0, domaine_VDF.nb_elem()), KOKKOS_LAMBDA (const int elem, double& local_max) + { // choix du facteur double rcp = 1.; if (!is_concentration) { - const int Ccp = sub_type(Champ_Uniforme, mon_equation->milieu().capacite_calorifique()); - const int Cr = sub_type(Champ_Uniforme, mon_equation->milieu().masse_volumique()); - const DoubleTab& tab_Cp = mon_equation->milieu().capacite_calorifique().valeurs(), tab_r = 
mon_equation->milieu().masse_volumique().valeurs(); - rcp = tab_r(Cr ? 0 : elem, 0) * tab_Cp(Ccp ? 0 : elem, 0); + rcp = r(Cr ? 0 : elem, 0) * Cp(Ccp ? 0 : elem, 0); } double moy = 0.; - for (int i = 0; i < 2 * dimension; i++) numfa[i] = elem_faces(elem, i); + for (int i = 0; i < 2 * l_dim; i++) numfa[i] = elem_faces(elem, i); // small loop over 4 to 6 elements // XXX : E Saikali j'ai corrige pour multi inco parce que c'etait 1/dx et pas 1/dx^2 ... donc attention si ecart ! - // c'etait comme ca : for (int d = 0; d < dimension; d++) moy += 1. / (domaine_VDF.dist_face(numfa[d], numfa[dimension + d], d)); - for (int d = 0; d < dimension; d++) + // c'etait comme ca : for (int d = 0; d < l_dim; d++) moy += 1. / (domaine_VDF.dist_face(numfa[d], numfa[l_dim + d], d)); + for (int d = 0; d < l_dim; d++) // Also small { - const double hd = domaine_VDF.dist_face(numfa[d], numfa[dimension + d], d); + const double hd = domaine_VDF.dist_face(numfa[d], numfa[l_dim + d], d); moy += 1. / (hd * hd); } - const double alpha_local = (alpha_(elem) + alpha_t(elem)) / rcp * moy; - coef = std::max(coef, alpha_local); - } + const double alpha_local = (self->alpha_(elem) + alpha_t(elem)) / rcp * moy; + local_max = std::max(coef, alpha_local); + }, + Kokkos::Max(coef) + ); coef = Process::mp_max(coef); dt_stab = 1. / (2. 
* (coef + DMINFLOAT)); @@ -88,6 +98,7 @@ double Op_Dift_VDF_Elem_base::calculer_dt_stab_elem_axi() const if (dimension == 2) { int numfa[4]; + ToDo_Kokkos("critical"); for (int elem=0; elemnb_elem(); num_elem++) { const int fx0 = elem_faces(num_elem,0), fx1 = elem_faces(num_elem,dimension), fy0 = elem_faces(num_elem,1), fy1 = elem_faces(num_elem,1+dimension); @@ -91,6 +92,7 @@ void Op_Dift_VDF_Face_Axi_base::ajouter_elem(const DoubleVect& visco_turb, const void Op_Dift_VDF_Face_Axi_base::ajouter_elem_3D(const DoubleVect& visco_turb, const DoubleTab& tau_diag, DoubleTab& resu) const { + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++) { const int fz0 = elem_faces(num_elem,2), fz1 = elem_faces(num_elem,2+dimension); @@ -112,18 +114,17 @@ void Op_Dift_VDF_Face_Axi_base::ajouter_aretes_bords(const DoubleVect& visco_tur case TypeAreteBordVDF::PAROI_PAROI: // paroi-paroi { const int fac1 = Qdm(n_arete,0), fac2 = Qdm(n_arete,1), fac3 = Qdm(n_arete,2), ori3 = orientation(fac3); - const int rang1 = (fac1 - le_dom_vdf->premiere_face_bord()), rang2 = (fac2-le_dom_vdf->premiere_face_bord()); double coef; if (is_var()) // XXX : E Saikali : sais pas quoi faire sinon ecarts ... { // Calcul du frottement identique a celui de TRIOVF : On calcule la moyenne des u_star et on l'eleve au carre. 
On calcule la moyenne des surfaces - const double tau_tan_1 = tau_tan(rang1,ori3), tau_tan_2 = tau_tan(rang2,ori3) ; + const double tau_tan_1 = tau_tan(fac1,ori3), tau_tan_2 = tau_tan(fac2,ori3) ; double tau = 0.5*(tau_tan_1 + tau_tan_2 ), surf = 0.5*(surface(fac1)+surface(fac2)); coef = tau*tau*surf; } else // Autre solution pour le calcul du frottement : On calcule u_star*u_star*surf sur chaque partie de la facette de Qdm { - const double tau1 = tau_tan(rang1,ori3)*0.5*surface(fac1), tau2 = tau_tan(rang2,ori3)*0.5*surface(fac2); + const double tau1 = tau_tan(fac1,ori3)*0.5*surface(fac1), tau2 = tau_tan(fac2,ori3)*0.5*surface(fac2); coef = tau1+tau2; } @@ -345,6 +346,7 @@ void Op_Dift_VDF_Face_Axi_base::ajouter_contribution_elem(const DoubleVect& visc auto& tab1 = matrice.get_set_tab1(); auto& tab2 = matrice.get_set_tab2(); auto& coeff = matrice.get_set_coeff(); + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++) { const int fx0 = elem_faces(num_elem,0), fx1 = elem_faces(num_elem,dimension), fy0 = elem_faces(num_elem,1), fy1 = elem_faces(num_elem,1+dimension); @@ -380,6 +382,7 @@ void Op_Dift_VDF_Face_Axi_base::ajouter_contribution_elem(const DoubleVect& visc void Op_Dift_VDF_Face_Axi_base::ajouter_contribution_elem_3D(const DoubleVect& visco_turb, const DoubleTab& tau_diag, Matrice_Morse& matrice) const { + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++) { const int fz0 = elem_faces(num_elem,2), fz1 = elem_faces(num_elem,2+dimension); @@ -593,17 +596,16 @@ void Op_Dift_VDF_Face_Axi_base::contribue_au_second_membre(DoubleTab& resu ) con case TypeAreteBordVDF::PAROI_PAROI: { const int fac1 = Qdm(n_arete,0), fac2 = Qdm(n_arete,1), fac3 = Qdm(n_arete,2), ori3 = orientation(fac3); - const int rang1 = (fac1 - le_dom_vdf->premiere_face_bord()), rang2 = (fac2 - le_dom_vdf->premiere_face_bord()); double coef; if (is_var()) { // Calcul du frottement identique a celui de TRIOVF : On 
calcule la moyenne des u_star et on l'eleve au carre. On calcule la moyenne des surfaces - const double tau = 0.5*(sqrt(tau_tan(rang1,ori3)) + sqrt(tau_tan(rang2,ori3))), surf = 0.5*(surface(fac1)+surface(fac2)); + const double tau = 0.5*(sqrt(tau_tan(fac1,ori3)) + sqrt(tau_tan(fac2,ori3))), surf = 0.5*(surface(fac1)+surface(fac2)); coef = tau*tau*surf; } else // Autre solution pour le calcul du frottement : On calcule u_star*u_star*surf sur chaque partie de la facette de Qdm { - const double tau1 = tau_tan(rang1,ori3)*0.5*surface(fac1), tau2 = tau_tan(rang2,ori3)*0.5*surface(fac2); + const double tau1 = tau_tan(fac1,ori3)*0.5*surface(fac1), tau2 = tau_tan(fac2,ori3)*0.5*surface(fac2); coef = tau1+tau2; } diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_base.cpp index 5883abcf0f..ef62eba3e5 100644 --- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_base.cpp +++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_base.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -36,67 +36,67 @@ double Op_Dift_VDF_Face_base::calculer_dt_stab() const */ double Op_Dift_VDF_Face_base::calculer_dt_stab(const Domaine_VDF& domaine_VDF) const { - double dt_stab, coef = -1.e10; - const DoubleTab& diffu = diffusivite().valeurs(), &diffu_turb = diffusivite_turbulente().valeurs(); + double coef = -1.e10; + const DoubleTab& tab_diffu = diffusivite().valeurs(), &tab_diffu_turb = diffusivite_turbulente().valeurs(); // B.Mat. 9/3/2005: pour traiter monophasique/qc/front-tracking de facon generique. 
Mettre a jour le qc et l'ancien ft pour utiliser ce mecanisme const int nb_elem = domaine_VDF.nb_elem(), dim = Objet_U::dimension; + Domaine_VDF_View dom_VDF(domaine_VDF); + CDoubleTabView diffu = tab_diffu.view_ro(); + CDoubleTabView diffu_turb = tab_diffu_turb.view_ro(); + const int nb_comp_diffu = tab_diffu.line_size(), nb_comp_diffu_turb = tab_diffu_turb.line_size(); + if (has_champ_masse_volumique()) { - const DoubleTab& valeurs_rho = get_champ_masse_volumique().valeurs(); - for (int elem = 0; elem < nb_elem; elem++) - { - double diflo = 0.; - for (int i = 0; i < dim; i++) - { - const double h = domaine_VDF.dim_elem(elem, i); - diflo += 1. / (h * h); - } - double mu_physique = diffu(elem, 0), mu_turbulent = diffu_turb(elem, 0); - - for (int ncomp = 1; ncomp < diffu.line_size(); ncomp++) mu_physique = std::max(mu_physique, diffu(elem, ncomp)); - for (int ncomp = 1; ncomp < diffu_turb.line_size(); ncomp++) mu_turbulent = std::max(mu_turbulent, diffu_turb(elem, ncomp)); - - const double inv_rho = 1./valeurs_rho(elem) ; - diflo *= (mu_physique + mu_turbulent) * inv_rho; - coef = std::max(coef, diflo); - } + const DoubleTab& tab_rho = get_champ_masse_volumique().valeurs(); + CDoubleTabView rho = tab_rho.view_ro(); + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem, double& coef_) + { + double diflo = 0.; + for (int i = 0; i < dim; i++) + { + const double h = dom_VDF.dim_elem(elem, i); + diflo += 1. 
/ (h * h); + } + double mu_physique = diffu(elem, 0), mu_turbulent = diffu_turb(elem, 0); + for (int ncomp = 1; ncomp < nb_comp_diffu; ncomp++) mu_physique = Kokkos::fmax(mu_physique, diffu(elem, ncomp)); + for (int ncomp = 1; ncomp < nb_comp_diffu_turb; ncomp++) mu_turbulent = Kokkos::fmax(mu_turbulent, diffu_turb(elem, ncomp)); + diflo *= (mu_physique + mu_turbulent) / rho(elem, 0); + if (diflo > coef_) coef_ = diflo; + }, Kokkos::Max(coef)); + end_gpu_timer(__KERNEL_NAME__); } else { const Champ_base& champ_diffu = diffusivite_pour_pas_de_temps(); - const DoubleTab& diffu_dt = champ_diffu.valeurs(); - const int diffu_dt_variable = (diffu_dt.dimension(0) == 1) ? 0 : 1, diffu_variable = (diffu.dimension(0) == 1) ? 0 : 1; - for (int elem = 0; elem < nb_elem; elem++) - { - double diflo = 0.; - for (int i = 0; i < dim; i++) - { - const double h = domaine_VDF.dim_elem(elem, i); - diflo += 1. / (h * h); - } - - int item = (diffu_variable ? elem : 0); - double mu_physique = diffu(item, 0), mu_turbulent = diffu_turb(elem, 0); - - for (int ncomp = 1; ncomp < diffu.line_size(); ncomp++) mu_physique = std::max(mu_physique, diffu(item, ncomp)); - for (int ncomp = 1; ncomp < diffu_turb.line_size(); ncomp++) mu_turbulent = std::max(mu_turbulent, diffu_turb(elem, ncomp)); - - item = (diffu_dt_variable ? elem : 0); - double diffu_dt_l = diffu_dt(item, 0); - - for (int ncomp = 1; ncomp < diffu_dt.line_size(); ncomp++) diffu_dt_l = std::max(diffu_dt_l, diffu_dt(item, ncomp)); - - // si on a associe mu au lieu de nu , on a nu sans diffu_dt - // le pas de temps de stab est nu+nu_t, on calcule (mu+mu_t)*(nu/mu)=(mu+mu_t)/rho=nu+nu_t (avantage par rapport a la division par rho ca marche aussi pour alpha et lambda et en VEF - diflo *= (mu_physique + mu_turbulent)*(diffu_dt_l)/mu_physique ; - coef = std::max(coef, diflo); - } + const DoubleTab& tab_diffu_dt = champ_diffu.valeurs(); + const int diffu_dt_variable = (tab_diffu_dt.dimension(0) == 1) ? 
0 : 1, diffu_variable = (tab_diffu.dimension(0) == 1) ? 0 : 1; + const int nb_comp_diffu_dt = tab_diffu_dt.line_size(); + CDoubleTabView diffu_dt = tab_diffu_dt.view_ro(); + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem, double& coef_) + { + double diflo = 0.; + for (int i = 0; i < dim; i++) + { + const double h = dom_VDF.dim_elem(elem, i); + diflo += 1. / (h * h); + } + const int item = (diffu_variable ? elem : 0); + double mu_physique = diffu(item, 0), mu_turbulent = diffu_turb(elem, 0); + for (int ncomp = 1; ncomp < nb_comp_diffu; ncomp++) mu_physique = Kokkos::fmax(mu_physique, diffu(item, ncomp)); + for (int ncomp = 1; ncomp < nb_comp_diffu_turb; ncomp++) mu_turbulent = Kokkos::fmax(mu_turbulent, diffu_turb(elem, ncomp)); + const int item_dt = (diffu_dt_variable ? elem : 0); + double diffu_dt_l = diffu_dt(item_dt, 0); + for (int ncomp = 1; ncomp < nb_comp_diffu_dt; ncomp++) diffu_dt_l = Kokkos::fmax(diffu_dt_l, diffu_dt(item_dt, ncomp)); + // si on a associe mu au lieu de nu , on a nu sans diffu_dt + // le pas de temps de stab est nu+nu_t, on calcule (mu+mu_t)*(nu/mu)=(mu+mu_t)/rho=nu+nu_t (avantage par rapport a la division par rho ca marche aussi pour alpha et lambda et en VEF + diflo *= (mu_physique + mu_turbulent) * diffu_dt_l / mu_physique; + if (diflo > coef_) coef_ = diflo; + }, Kokkos::Max(coef)); + end_gpu_timer(__KERNEL_NAME__); } coef = Process::mp_max(coef); - dt_stab = 0.5 / (coef+DMINFLOAT); - - return dt_stab; + return 0.5 / (coef + DMINFLOAT); } void Op_Dift_VDF_Face_base::calculer_borne_locale(DoubleVect& borne_visco_turb,double dt,double dt_diff_sur_dt_conv) const @@ -106,6 +106,7 @@ void Op_Dift_VDF_Face_base::calculer_borne_locale(DoubleVect& borne_visco_turb,d const DoubleVect& diffu = champ_diffu.valeurs(); const int diffu_variable = (diffu.size() == 1) ? 0 : 1, nb_elem = domaine_VDF.nb_elem(); const double diffu_constante = (diffu_variable ? 0. 
: diffu(0)); + ToDo_Kokkos("critical"); for (int elem=0; elem(); } diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Face.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Face.cpp index 15d6a6dc56..eac0189304 100644 --- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Face.cpp +++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Face.cpp @@ -79,6 +79,7 @@ double Op_Dift_Multiphase_VDF_Face::calculer_dt_stab() const double mu_turbulent, mu_physique, nu_physique; + ToDo_Kokkos("critical"); for (int elem = 0; elem < domaine_VDF.nb_elem(); elem++) { double diflo = 0.; diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_VDF_Elem_leaves.h b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_VDF_Elem_leaves.h index 27d0d25e15..0c3f9f2ee1 100644 --- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_VDF_Elem_leaves.h +++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_VDF_Elem_leaves.h @@ -34,7 +34,7 @@ class Op_Dift_VDF_Elem : public Op_Dift_VDF_Elem_base, public Op_Diff_Dift_VDF(i); } + KOKKOS_INLINE_FUNCTION double alpha_(const int i) const override { return alpha_impl(i); } inline void associer_loipar(const Turbulence_paroi_scal_base& lp ) { associer_loipar_impl(lp); } inline void associer(const Domaine_dis_base& zd, const Domaine_Cl_dis_base& zcd, const Champ_Inc_base& ch) override { associer_impl(zd,zcd,ch); } inline void associer_diffusivite_turbulente(const Champ_Fonc_base& ch) { associer_diffusivite_turbulente_impl(ch); } @@ -55,7 +55,7 @@ class Op_Dift_VDF_Elem_Axi : public Op_Dift_VDF_Elem_base, public Op_Diff_Dift_V public: Op_Dift_VDF_Elem_Axi(); inline double calculer_dt_stab() const override { return calculer_dt_stab_elem_axi(); } - inline double alpha_(const int i) const override { return alpha_impl(i); } + KOKKOS_INLINE_FUNCTION double alpha_(const int i) const override { return alpha_impl(i); } inline void associer_loipar(const Turbulence_paroi_scal_base& lp ) { associer_loipar_impl(lp); } inline void associer(const Domaine_dis_base& 
zd, const Domaine_Cl_dis_base& zcd, const Champ_Inc_base& ch) override { associer_impl(zd,zcd,ch); } inline void associer_diffusivite_turbulente(const Champ_Fonc_base& ch) { associer_diffusivite_turbulente_impl(ch); } @@ -76,12 +76,11 @@ class Op_Dift_VDF_Multi_inco_Elem : public Op_Dift_VDF_Elem_base, public Op_Diff public: Op_Dift_VDF_Multi_inco_Elem(); inline double calculer_dt_stab() const override { return calculer_dt_stab_elem(); } - inline double alpha_(const int i) const override + KOKKOS_INLINE_FUNCTION double alpha_(const int i) const override { const DoubleTab& alpha = diffusivite_pour_pas_de_temps().valeurs(); - const int is_var = sub_type(Champ_Uniforme, diffusivite()) ? 0 : 1; - double alpha_lam = alpha(is_var * i,0); - for (int k = 1; k < alpha.line_size(); k++) alpha_lam = std::max(alpha_lam, alpha(is_var * i,k)); + double alpha_lam = alpha(is_var_ * i,0); + for (int k = 1; k < alpha.line_size(); k++) alpha_lam = std::max(alpha_lam, alpha(is_var_ * i,k)); return alpha_lam; } inline void associer_loipar(const Turbulence_paroi_scal_base& lp ) { associer_loipar_impl(lp); } @@ -94,6 +93,9 @@ class Op_Dift_VDF_Multi_inco_Elem : public Op_Dift_VDF_Elem_base, public Op_Diff completer_impl(); associer_pb(equation().probleme()); } + +private: + const int is_var_ = sub_type(Champ_Uniforme, diffusivite()) ? 0 : 1; }; // =========================================================================================================================================== diff --git a/src/VDF/Operateurs/Op_Divers/Op_Div_VDF_Elem.cpp b/src/VDF/Operateurs/Op_Divers/Op_Div_VDF_Elem.cpp index 1f463c6b9d..bff2fc795a 100644 --- a/src/VDF/Operateurs/Op_Divers/Op_Div_VDF_Elem.cpp +++ b/src/VDF/Operateurs/Op_Divers/Op_Div_VDF_Elem.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -14,6 +14,8 @@ *****************************************************************************/ #include +#include +#include Implemente_instanciable_sans_constructeur(Op_Div_VDF_Elem,"Op_Div_VDF_Face",Op_Div_VDF_base); @@ -37,11 +39,16 @@ void Op_Div_VDF_Elem::associer(const Domaine_dis_base& domaine_dis, const Domain la_zcl_vdf = zclvdf; } -void Op_Div_VDF_Elem::volumique(DoubleTab& div) const +void Op_Div_VDF_Elem::volumique(DoubleTab& tab_div) const { const Domaine_VDF& domaine_VDF = le_dom_vdf.valeur(); - const DoubleVect& vol = domaine_VDF.volumes(); const int nb_elem = domaine_VDF.domaine().nb_elem_tot(); - for(int num_elem = 0; num_elem < nb_elem; num_elem++) div(num_elem) /= vol(num_elem); + CDoubleArrView vol = domaine_VDF.volumes().view_ro(); + DoubleArrView div = static_cast(tab_div).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int num_elem) + { + div(num_elem) /= vol(num_elem); + }); + end_gpu_timer(__KERNEL_NAME__); } diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_P0_to_Face.cpp b/src/VDF/Operateurs/Op_Divers/Op_Grad_P0_to_Face.cpp index ecc95c7b6f..423bce2eed 100644 --- a/src/VDF/Operateurs/Op_Divers/Op_Grad_P0_to_Face.cpp +++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_P0_to_Face.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -59,49 +59,61 @@ void Op_Grad_P0_to_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, c const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); if (sub_type(Periodique, la_cl.valeur())) // Correction periodicite - for (int num_face = ndeb; num_face < nfin; num_face++) - { - const int n0 = face_voisins(num_face, 0), n1 = face_voisins(num_face, 1); - const double dist = volume_entrelaces(num_face) / face_surfaces(num_face); - secmem(num_face, k) -= (inco(n1, k) - inco(n0, k)) / dist; - } + { + ToDo_Kokkos("critical"); + for (int num_face = ndeb; num_face < nfin; num_face++) + { + const int n0 = face_voisins(num_face, 0), n1 = face_voisins(num_face, 1); + const double dist = volume_entrelaces(num_face) / face_surfaces(num_face); + secmem(num_face, k) -= (inco(n1, k) - inco(n0, k)) / dist; + } + } else if (sub_type(Dirichlet, la_cl.valeur())) // Cas CL Dirichlet { const Dirichlet& cl = ref_cast(Dirichlet, la_cl.valeur()); // XXX Elie Saikali : on calcule pas si champ_front_var n'est pas initialise if (cl.champ_front().has_valeurs_au_temps(cl.champ_front().get_temps_defaut())) - for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) - { - int n0 = face_voisins(num_face, 0); - if (n0 < 0) - n0 = face_voisins(num_face, 1); - const int ori = orientation(num_face); - secmem(num_face, k) -= (inco(n0, k) - cl.val_imp(num_face_cl, k)) / (xp(n0, ori) - xv(num_face, ori)); - } + { + ToDo_Kokkos("critical"); + for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) + { + int n0 = face_voisins(num_face, 0); + if (n0 < 0) + n0 = face_voisins(num_face, 1); + const int ori = orientation(num_face); + secmem(num_face, k) -= (inco(n0, k) - cl.val_imp(num_face_cl, k)) / (xp(n0, ori) - xv(num_face, ori)); + } + } } else if 
(sub_type(Dirichlet_homogene, la_cl.valeur())) // Cas Dirichlet homogene, i.e. valeur nulle a la paroi - for (int num_face = ndeb; num_face < nfin; num_face++) - { - int n0 = face_voisins(num_face, 0); - if (n0 < 0) - n0 = face_voisins(num_face, 1); - const int ori = orientation(num_face); - secmem(num_face, k) -= inco(n0, k) / (xp(n0, ori) - xv(num_face, ori)); - } + { + ToDo_Kokkos("critical"); + for (int num_face = ndeb; num_face < nfin; num_face++) + { + int n0 = face_voisins(num_face, 0); + if (n0 < 0) + n0 = face_voisins(num_face, 1); + const int ori = orientation(num_face); + secmem(num_face, k) -= inco(n0, k) / (xp(n0, ori) - xv(num_face, ori)); + } + } else if (sub_type(Echange_impose_base, la_cl.valeur())) // Cas Echange_impose_base { const Echange_impose_base& cl = ref_cast(Echange_impose_base, la_cl.valeur()); if (cl.has_h_imp_grad()) - for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) - { - int n0 = face_voisins(num_face, 0); - if (n0 < 0) - n0 = face_voisins(num_face, 1); - if (face_voisins(num_face, 0) >= 0) - secmem(num_face, k) -= (inco(n0, k) - cl.T_ext(num_face_cl, k)) * cl.h_imp_grad(num_face_cl, k); // Si bien oriente - else - secmem(num_face, k) += (inco(n0, k) - cl.T_ext(num_face_cl, k)) * cl.h_imp_grad(num_face_cl, k); // Si oriente a envers - } + { + ToDo_Kokkos("critical"); + for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) + { + int n0 = face_voisins(num_face, 0); + if (n0 < 0) + n0 = face_voisins(num_face, 1); + if (face_voisins(num_face, 0) >= 0) + secmem(num_face, k) -= (inco(n0, k) - cl.T_ext(num_face_cl, k)) * cl.h_imp_grad(num_face_cl, k); // Si bien oriente + else + secmem(num_face, k) += (inco(n0, k) - cl.T_ext(num_face_cl, k)) * cl.h_imp_grad(num_face_cl, k); // Si oriente a envers + } + } else { /* Do nothing */ } } else if (sub_type(Neumann_paroi, la_cl.valeur())) // Cas Neumann_paroi @@ -109,36 +121,43 @@ void 
Op_Grad_P0_to_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, c const Neumann_paroi& cl = ref_cast(Neumann_paroi, la_cl.valeur()); // XXX Elie Saikali : on calcule pas si champ_front_var n'est pas initialise if (cl.champ_front().has_valeurs_au_temps(cl.champ_front().get_temps_defaut())) - for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) - { - if (face_voisins(num_face, 0) >= 0) - secmem(num_face, k) -= cl.flux_impose(num_face_cl, k); // Si bien oriente - else - secmem(num_face, k) += cl.flux_impose(num_face_cl, k); // Si oriente a envers - } + { + ToDo_Kokkos("critical"); + for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++) + { + if (face_voisins(num_face, 0) >= 0) + secmem(num_face, k) -= cl.flux_impose(num_face_cl, k); // Si bien oriente + else + secmem(num_face, k) += cl.flux_impose(num_face_cl, k); // Si oriente a envers + } + } } else if (!sub_type(Neumann_homogene, la_cl.valeur())) // En Neumann homogene, i.e. 
symetrie, la derivee a la face est nulle => on fait rien - for (int num_face = ndeb; num_face < nfin; num_face++) - { - int n0 = face_voisins(num_face, 0); - if (n0 < 0) - n0 = face_voisins(num_face, 1); + { + ToDo_Kokkos("critical"); + for (int num_face = ndeb; num_face < nfin; num_face++) + { + int n0 = face_voisins(num_face, 0); + if (n0 < 0) + n0 = face_voisins(num_face, 1); - const int ori = orientation(num_face); - int face_opposee = zvdf.elem_faces(n0, ori); - if (face_opposee == num_face) - face_opposee = zvdf.elem_faces(n0, ori + dimension); + const int ori = orientation(num_face); + int face_opposee = zvdf.elem_faces(n0, ori); + if (face_opposee == num_face) + face_opposee = zvdf.elem_faces(n0, ori + dimension); - int n1 = face_voisins(face_opposee, 0); - if ((n1 < 0) || ((n1 == n0) && face_voisins(face_opposee, 1) >= 0)) - n1 = face_voisins(face_opposee, 1); + int n1 = face_voisins(face_opposee, 0); + if ((n1 < 0) || ((n1 == n0) && face_voisins(face_opposee, 1) >= 0)) + n1 = face_voisins(face_opposee, 1); - if (n1 != n0) - secmem(num_face, k) -= (inco(n1, k) - inco(n0, k)) / (xp(n1, ori) - xp(n0, ori)); - } + if (n1 != n0) + secmem(num_face, k) -= (inco(n1, k) - inco(n0, k)) / (xp(n1, ori) - xp(n0, ori)); + } + } } // Boucle sur les faces internes + ToDo_Kokkos("critical"); for (int num_face = zvdf.premiere_face_int(); num_face < zvdf.nb_faces(); num_face++) for (int k = 0; k < N; k++) { diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.cpp b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.cpp index cb23356d01..bf23150b81 100644 --- a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.cpp +++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.cpp @@ -47,8 +47,6 @@ void Op_Grad_VDF_Face::calculer_flux_bords() const const Domaine_Cl_VDF& zclvdf = la_zcl_vdf.valeur(); const Navier_Stokes_std& eqn_hydr = ref_cast(Navier_Stokes_std,equation()); const Champ_P0_VDF& la_pression_P0 = ref_cast(Champ_P0_VDF,eqn_hydr.pression_pa()); - const DoubleTab& pression_P0 = 
la_pression_P0.valeurs(); - const DoubleVect& face_surfaces = zvdf.face_surfaces(); int nb_bord = zvdf.nb_front_Cl(); for (int n_bord=0; n_bordfrontiere_dis()); int ndeb = le_bord.num_premiere_face(); int nfin = ndeb + le_bord.nb_faces(); - for (int face=ndeb; face(la_pression_P0.valeurs()).view_ro(); + DoubleTabView flux_bords = flux_bords_.view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int face) + { + int elem0 = face_voisins_v(face,0); + int ori = orientation(face); + double n0 = face_surfaces(face)*porosite_surf_v(face); + if (elem0 != -1) flux_bords(face,ori) = (pression_P0(elem0))*n0 ; + else + { + int elem1 = face_voisins_v(face,1); + flux_bords(face,ori) = -(pression_P0(elem1))*n0 ; + } + }); + end_gpu_timer(__KERNEL_NAME__); } // fin for n_bord } @@ -79,9 +84,8 @@ int Op_Grad_VDF_Face::impr(Sortie& os) const const Schema_Temps_base& sch = equation().probleme().schema_temps(); const Domaine_VDF& zvdf = le_dom_vdf.valeur(); const Domaine_Cl_VDF& zclvdf = la_zcl_vdf.valeur(); - int face, ori; - DoubleTab xgr; - if (impr_mom) xgr = zvdf.calculer_xgr(); + DoubleTrav tab_xgr; + if (impr_mom) tab_xgr = zvdf.calculer_xgr(); // flux_bords contains the sum of flux on each boundary: DoubleTrav tab_flux_bords(3,zvdf.nb_front_Cl(),3); tab_flux_bords=0.; @@ -89,7 +93,7 @@ int Op_Grad_VDF_Face::impr(Sortie& os) const flux_bord_perio1(k) -> flux_bords2(1,num_cl,k) flux_bord_perio2(k) -> flux_bords2(2,num_cl,k) moment(k) -> flux_bords2(3,num_cl,k) */ - int nb_bord = zvdf.nb_front_Cl(); + int nb_bord = zvdf.nb_front_Cl(); for (int n_bord=0; n_bord(); + CDoubleTabView xgr; + if (impr_mom) xgr = tab_xgr.view_ro(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int face) + { + int ori = orientation(face); + Kokkos::atomic_add(&sum_flux_bords(0, n_bord, ori), + flux_bords(face, ori)); - for (face=ndeb; face= 0) @@ -239,20 +256,30 @@ void 
Op_Grad_VDF_Face::dimensionner_blocs(matrices_t matrices, const tabs_t& sem } -void Op_Grad_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, const tabs_t& semi_impl) const +void Op_Grad_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& tab_secmem, const tabs_t& semi_impl) const { - Matrice_Morse *mat = matrices.count("pression") ? matrices.at("pression") : nullptr; - const DoubleTab& inco = semi_impl.count("pression") ? semi_impl.at("pression") : (le_champ_inco ? le_champ_inco->valeurs() : ref_cast(Navier_Stokes_std, equation()).pression().valeurs()), - *alp = sub_type(Pb_Multiphase, equation().probleme()) ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr; + const DoubleTab& tab_inco = semi_impl.count("pression") ? semi_impl.at("pression") : (le_champ_inco ? le_champ_inco->valeurs() : ref_cast(Navier_Stokes_std, equation()).pression().valeurs()); + const bool is_pbm = sub_type(Pb_Multiphase, equation().probleme()); + const DoubleTab *alp = is_pbm ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr; + Matrice_Morse *ptr_mat = matrices.count("pression") ? 
matrices.at("pression") : nullptr; - assert_espace_virtuel_vect(inco); + assert_espace_virtuel_vect(tab_inco); const Domaine_VDF& zvdf = le_dom_vdf.valeur(); const Domaine_Cl_VDF& zclvdf = la_zcl_vdf.valeur(); - const DoubleVect& face_surfaces = zvdf.face_surfaces(), &vf = zvdf.volumes_entrelaces(); - const DoubleTab& vfd = zvdf.volumes_entrelaces_dir(); - const int M = inco.line_size(), N = secmem.line_size(); + const int M = tab_inco.line_size(), N = tab_secmem.line_size(); + Matrice_Morse_View mat; + if (ptr_mat) mat.set(*ptr_mat); + CDoubleTabView alpha; + if (is_pbm) alpha = alp->view_ro(); + CIntTabView face_voisins_v = face_voisins.view_ro(); + CDoubleArrView porosite_surf_v = porosite_surf.view_ro(); + CDoubleArrView face_surfaces = zvdf.face_surfaces().view_ro(); + CDoubleArrView vf = zvdf.volumes_entrelaces().view_ro(); + CDoubleTabView vfd = zvdf.volumes_entrelaces_dir().view_ro(); + CDoubleTabView inco = tab_inco.view_ro(); + DoubleTabView secmem = tab_secmem.view_rw(); // Boucle sur les bords pour traiter les conditions aux limites for (int n_bord = 0; n_bord < zvdf.nb_front_Cl(); n_bord++) { @@ -263,57 +290,66 @@ void Op_Grad_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, con if ( sub_type(Neumann_sortie_libre,la_cl.valeur()) ) { const Neumann_sortie_libre& la_cl_typee = ref_cast(Neumann_sortie_libre, la_cl.valeur()); - for (int num_face = ndeb; num_face < nfin; num_face++) + const double coeff_P = Option_VDF::coeff_P_neumann; + CDoubleTabView flux_impose = la_cl_typee.tab_flux_impose().view_ro(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { for (int n = 0, m = 0; n < N; n++, m += (M > 1)) { - const double P_imp = la_cl_typee.flux_impose(num_face-ndeb, m); - - const int n0 = face_voisins(num_face,0); + const double P_imp = flux_impose(num_face - ndeb, m); + const int n0 = face_voisins_v(num_face, 0); if (n0 != -1) { - const double coef = 
face_surfaces(num_face)*porosite_surf(num_face) * Option_VDF::coeff_P_neumann * (alp ? (*alp)(n0, n) : 1); - if(mat) (*mat)(N * num_face + n, M * n0 + m) -= coef; + const double coef = face_surfaces(num_face) * porosite_surf_v(num_face) * coeff_P * (is_pbm ? alpha(n0, n) : 1.0); + if (ptr_mat) mat.atomic_add(N * num_face + n, M * n0 + m, -coef); secmem(num_face, n) -= coef * (P_imp - inco(n0, m)); } else { - const int n1 = face_voisins(num_face,1); - const double coef = face_surfaces(num_face)*porosite_surf(num_face) * Option_VDF::coeff_P_neumann * (alp ? (*alp)(n1, n) : 1.0); - if(mat) (*mat)(N * num_face + n, M * n1 + m) += coef; + const int n1 = face_voisins_v(num_face, 1); + const double coef = face_surfaces(num_face) * porosite_surf_v(num_face) * coeff_P * (is_pbm ? alpha(n1, n) : 1.0); + if (ptr_mat) mat.atomic_add(N * num_face + n, M * n1 + m, coef); secmem(num_face, n) -= coef * (inco(n1, m) - P_imp); } } + }); + end_gpu_timer(__KERNEL_NAME__); } else if (sub_type(Periodique,la_cl.valeur())) // Correction periodicite { - for (int f = ndeb; f < nfin; f++) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int f) + { for (int n = 0, m = 0; n < N; n++, m += (M > 1)) { - const int n0 = face_voisins(f, 0), n1 = face_voisins(f, 1); - const double alpha_face = alp ? (vfd(f, 0) * (*alp)(n0, n) + vfd(f, 1) * (*alp)(n1, n)) / vf(f) : 1.0; - const double coef = face_surfaces(f) * porosite_surf(f) * alpha_face; + const int n0 = face_voisins_v(f, 0), n1 = face_voisins_v(f, 1); + const double alpha_face = is_pbm ? 
(vfd(f, 0) * alpha(n0, n) + vfd(f, 1) * alpha(n1, n)) / vf(f) : 1.0; + const double coef = face_surfaces(f) * porosite_surf_v(f) * alpha_face; secmem(f, n) -= coef * (inco(n1, m) - inco(n0, m)); } + }); + end_gpu_timer(__KERNEL_NAME__); } else if (sub_type(Symetrie,la_cl.valeur())) { /* Do nothing */ } else if ( (sub_type(Dirichlet,la_cl.valeur())) || (sub_type(Dirichlet_homogene,la_cl.valeur())) ) { /* Do nothing */ } } // Boucle sur les faces internes - for (int f = zvdf.premiere_face_int(); f < zvdf.nb_faces(); f++) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(zvdf.premiere_face_int(), zvdf.nb_faces()), KOKKOS_LAMBDA(const int f) + { for (int n = 0, m = 0; n < N; n++, m += (M > 1)) { - const int n0 = face_voisins(f, 0), n1 = face_voisins(f, 1); + const int n0 = face_voisins_v(f, 0), n1 = face_voisins_v(f, 1); // XXX : Elie Saikali : attention : on code alpha grad(P) et pas grad(alpha.P) !! Sinon on manque des termes ... (voir avec Antoine sinon) - const double alpha_face = alp ? (vfd(f, 0) * (*alp)(n0, n) + vfd(f, 1) * (*alp)(n1, n)) / vf(f) : 1.0; - const double coef = face_surfaces(f) * porosite_surf(f) * alpha_face; - if(mat) + const double alpha_face = is_pbm ? 
(vfd(f, 0) * alpha(n0, n) + vfd(f, 1) * alpha(n1, n)) / vf(f) : 1.0; + const double coef = face_surfaces(f) * porosite_surf_v(f) * alpha_face; + if (ptr_mat) { - (*mat)(N * f + n, M * n0 + m) -= coef; - (*mat)(N * f + n, M * n1 + m) += coef; + mat.atomic_add(N * f + n, M * n0 + m, -coef); + mat.atomic_add(N * f + n, M * n1 + m, +coef); } secmem(f, n) -= coef * (inco(n1, m) - inco(n0, m)); } - - secmem.echange_espace_virtuel(); + }); + end_gpu_timer(__KERNEL_NAME__); + tab_secmem.echange_espace_virtuel(); } diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.h b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.h index 15a7a8fede..2130345ae0 100644 --- a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.h +++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.cpp b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.cpp index c980579fbc..b81417f8ab 100644 --- a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.cpp +++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -31,7 +31,7 @@ void Op_Grad_VDF_Face_base::associer(const Domaine_dis_base& domaine_dis, const porosite_surf.ref(la_zcl_vdf->equation().milieu().porosite_face()); volume_entrelaces.ref(zvdf.volumes_entrelaces()); face_voisins.ref(zvdf.face_voisins()); - orientation.ref(zvdf.orientation()); + orientation_.ref(zvdf.orientation()); xp.ref(zvdf.xp()); } diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.h b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.h index 844b63bcf9..2ad4d84255 100644 --- a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.h +++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -42,15 +42,15 @@ class Op_Grad_VDF_Face_base : public Operateur_Grad_base inline const double& volume_entrelaces_(int i) const { return volume_entrelaces(i); } inline double porosite_surf_(int i) { return porosite_surf(i); } inline const double& porosite_surf_(int i) const { return porosite_surf(i); } - inline int orientation_(int face) { return orientation(face); } - inline const int& orientation_(int face) const { return orientation(face); } + inline int orientation(int face) { return orientation_(face); } + inline const int& orientation(int face) const { return orientation_(face); } inline double xp_(int elem, int ori) { return xp(elem,ori); } inline const double& xp_(int elem, int ori) const { return xp(elem,ori); } protected: OBS_PTR(Domaine_VDF) le_dom_vdf; OBS_PTR(Domaine_Cl_VDF) la_zcl_vdf; - IntVect orientation; + IntVect orientation_; IntTab face_voisins; DoubleVect 
porosite_surf, volume_entrelaces; DoubleTab xp; diff --git a/src/VDF/Operateurs/Op_Divers/Op_VDF_Elem.cpp b/src/VDF/Operateurs/Op_Divers/Op_VDF_Elem.cpp index 156bea107c..65fc10f069 100644 --- a/src/VDF/Operateurs/Op_Divers/Op_VDF_Elem.cpp +++ b/src/VDF/Operateurs/Op_Divers/Op_VDF_Elem.cpp @@ -54,6 +54,7 @@ void Op_VDF_Elem::dimensionner_old(const Domaine_VDF& le_dom, const Domaine_Cl_V IntVect rang_voisin(n1*nb_comp); rang_voisin = 1; + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { const int elem1 = face_voisins(num_face,0), elem2 = face_voisins(num_face,1); @@ -73,6 +74,7 @@ void Op_VDF_Elem::dimensionner_old(const Domaine_VDF& le_dom, const Domaine_Cl_V int ind_face_global; IntVect fait(nfaces); fait = 0; + ToDo_Kokkos("critical"); for (int face = 0; face < nfaces; face++) { if (fait[face] == 0) @@ -104,6 +106,7 @@ void Op_VDF_Elem::dimensionner_old(const Domaine_VDF& le_dom, const Domaine_Cl_V } // on traite les faces internes pour les voisins + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { const int elem1 = face_voisins(num_face,0), elem2 = face_voisins(num_face,1); diff --git a/src/VDF/Operateurs/Op_Divers/Op_VDF_Face.cpp b/src/VDF/Operateurs/Op_Divers/Op_VDF_Face.cpp index 75732b4ea3..984affc339 100644 --- a/src/VDF/Operateurs/Op_Divers/Op_VDF_Face.cpp +++ b/src/VDF/Operateurs/Op_Divers/Op_VDF_Face.cpp @@ -42,6 +42,7 @@ void Op_VDF_Face::dimensionner(const Domaine_VDF& le_dom, const Domaine_Cl_VDF& IntVect rang_voisin(nfin); rang_voisin = 1; + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { const int ori = orientation(num_face), face1 = le_dom.face_amont_princ(num_face,0), face2 = le_dom.face_amont_princ(num_face,1), @@ -62,11 +63,13 @@ void Op_VDF_Face::dimensionner(const Domaine_VDF& le_dom, const Domaine_Cl_VDF& // on balaye les faces pour dimensionner tab1 et tab2 tab1(0) = 1; + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; 
num_face++) for (int k = 0; k < nb_comp; k++) tab1(num_face*nb_comp+1+k) = rang_voisin(num_face) + tab1(num_face*nb_comp+k); la_matrice.dimensionner(nfin*nb_comp,tab1(nfin*nb_comp)-1); + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { const int ori = orientation(num_face), face1 = le_dom.face_amont_princ(num_face,0), face2 = le_dom.face_amont_princ(num_face,1), @@ -102,6 +105,7 @@ void Op_VDF_Face::dimensionner(const Domaine_VDF& le_dom, const Domaine_Cl_VDF& const Front_VF& la_front_dis = ref_cast(Front_VF,la_cl->frontiere_dis()); const int ndeb_p = la_front_dis.num_premiere_face(), nfaces = la_front_dis.nb_faces(), nfin_p = ndeb_p + nfaces; + ToDo_Kokkos("critical"); for (int num_face = ndeb_p; num_face < nfin_p; num_face++) { const int ori = orientation(num_face); @@ -165,6 +169,7 @@ void Op_VDF_Face::modifier_pour_Cl(const Domaine_VDF& le_dom, const Domaine_Cl_V { const Dirichlet& la_cl_Dirichlet = ref_cast(Dirichlet, la_cl.valeur()); + ToDo_Kokkos("critical"); for (int face = numdeb; face < (numdeb + nfaces); face++) for (int comp = 0; comp < nb_comp; comp++) { @@ -177,6 +182,7 @@ void Op_VDF_Face::modifier_pour_Cl(const Domaine_VDF& le_dom, const Domaine_Cl_V if (sub_type(Symetrie, la_cl.valeur())) { + ToDo_Kokkos("critical"); for (int face = numdeb; face < numdeb + nfaces; face++) for (int comp = 0; comp < nb_comp; comp++) { @@ -189,6 +195,7 @@ void Op_VDF_Face::modifier_pour_Cl(const Domaine_VDF& le_dom, const Domaine_Cl_V { const Dirichlet_homogene& la_cl_Dirichlet_homogene = ref_cast(Dirichlet_homogene, la_cl.valeur()); + ToDo_Kokkos("critical"); for (int face = numdeb; face < numdeb + nfaces; face++) for (int comp = 0; comp < nb_comp; comp++) { diff --git a/src/VDF/Solveurs/Assembleur_P_VDF.cpp b/src/VDF/Solveurs/Assembleur_P_VDF.cpp index e4a28e1360..21452913d2 100644 --- a/src/VDF/Solveurs/Assembleur_P_VDF.cpp +++ b/src/VDF/Solveurs/Assembleur_P_VDF.cpp @@ -136,6 +136,7 @@ int Assembleur_P_VDF::construire(Matrice& 
la_matrice) const int nb_faces_periodiques = liste_faces_periodiques(liste_faces_perio); const int nb_faces_internes = domaine_vdf.nb_faces_internes(); const int premiere_face_interne = domaine_vdf.premiere_face_int(); + ToDo_Kokkos("critical"); for (i = 0; i < nb_faces_internes + nb_faces_periodiques; i++) { int face; @@ -326,6 +327,7 @@ int Assembleur_P_VDF::remplir(Matrice& la_matrice, const DoubleVect& volumes_ent const int nb_faces_periodiques = liste_faces_periodiques(liste_faces_perio); const int nb_faces_internes = domaine_vdf.nb_faces_internes(); const int premiere_face_interne = domaine_vdf.premiere_face_int(); + ToDo_Kokkos("critical"); for (int i_face = 0; i_face < nb_faces_internes + nb_faces_periodiques; i_face++) { @@ -415,6 +417,7 @@ int Assembleur_P_VDF::remplir(Matrice& la_matrice, const DoubleVect& volumes_ent carre.set_est_definie(1); const int ndeb = la_front_dis.num_premiere_face(); const int nfin = ndeb + la_front_dis.nb_faces(); + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { // Calcul de rho sur cette face @@ -447,6 +450,10 @@ int Assembleur_P_VDF::remplir(Matrice& la_matrice, const DoubleVect& volumes_ent } } has_P_ref = (int)mp_max(has_P_ref); + // PL: we specify that the matrix has null space or not + // It will be used by PETSc solver for a better handling of such a case + // ToDo: Same for VEF, EF, PolyMAC but remove the *2 on coefficient ? 
Nothing was done for VDF + la_matrice->set_has_constant_nullspace(!has_P_ref); // Verification sanitaire: pas d'element nul sur la diagonale for (int i = 0; i < nb_elem; i++) @@ -565,6 +572,7 @@ void Assembleur_P_VDF::modifier_secmem_pression_imposee(const Neumann_sortie_lib { const int nb_faces = frontiere_vf.nb_faces(); const int num_premiere_face = frontiere_vf.num_premiere_face(); + ToDo_Kokkos("critical"); for (int i = 0; i < nb_faces; i++) { const int num_face = num_premiere_face + i; @@ -584,13 +592,9 @@ void Assembleur_P_VDF::modifier_secmem_pression_imposee(const Neumann_sortie_lib */ void Assembleur_P_VDF::modifier_secmem_vitesse_imposee(const Entree_fluide_vitesse_imposee& cond_lim, const Front_VF& frontiere_vf, - DoubleTab& secmem) + DoubleTab& tab_secmem) { const Champ_front_base& champ_front = cond_lim.champ_front(); - const Domaine_VDF& le_dom = le_dom_VDF.valeur(); - const DoubleVect& face_surfaces = le_dom.face_surfaces(); - const IntTab& face_voisins = le_dom.face_voisins(); - if (get_resoudre_en_u()) { if (champ_front.instationnaire()) @@ -600,22 +604,30 @@ void Assembleur_P_VDF::modifier_secmem_vitesse_imposee(const Entree_fluide_vites bool ch_unif = (tab_gpoint.nb_dim()==1 || tab_gpoint.dimension(0)==1); const int nb_faces = frontiere_vf.nb_faces(); const int num_premiere_face = frontiere_vf.num_premiere_face(); - for (int i = 0; i < nb_faces; i++) - { - const int num_face = num_premiere_face + i; - const double surface = face_surfaces(num_face); - const int elem0 = face_voisins(num_face, 0); - const int elem1 = face_voisins(num_face, 1); - // gpoint est relatif a la normale a la face (elle pointe vers elem1) - // La normale est-elle entrante ou sortante ? - const double signe = (elem0 < 0) ? 1. : -1.; - // Numero de l'element adjacent a la face de bord - const int elem = elem0 + elem1 + 1; - const int ori = le_dom.orientation(num_face); - const double gpoint = nb_dim==1 ? tab_gpoint(ori) : tab_gpoint(ch_unif ? 
0 : i, ori); - - secmem[elem] += signe * surface * gpoint; - } + const Domaine_VDF& le_dom = le_dom_VDF.valeur(); + const bool nb_dim_1 = (nb_dim == 1); + const int ncols = nb_dim_1 ? 1 : tab_gpoint.dimension(1); + CDoubleArrView gpoint = static_cast(tab_gpoint).view_ro(); + CDoubleArrView face_surfaces = static_cast(le_dom.face_surfaces()).view_ro(); + CIntTabView face_voisins = le_dom.face_voisins().view_ro(); + CIntArrView orientation = le_dom.orientation().view_ro(); + DoubleArrView secmem = static_cast(tab_secmem).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces), KOKKOS_LAMBDA(const int i) + { + const int num_face = num_premiere_face + i; + const double surface = face_surfaces(num_face); + const int elem0 = face_voisins(num_face, 0); + const int elem1 = face_voisins(num_face, 1); + // gpoint est relatif a la normale a la face (elle pointe vers elem1) + // La normale est-elle entrante ou sortante ? + const double signe = (elem0 < 0) ? 1. : -1.; + // Numero de l'element adjacent a la face de bord + const int elem = elem0 + elem1 + 1; + const int ori = orientation(num_face); + const int row = nb_dim_1 ? 0 : (ch_unif ? 
0 : i); + secmem(elem) += signe * surface * gpoint(row * ncols + ori); + }); + end_gpu_timer(__KERNEL_NAME__); } else { @@ -628,27 +640,29 @@ void Assembleur_P_VDF::modifier_secmem_vitesse_imposee(const Entree_fluide_vites } } -int Assembleur_P_VDF::modifier_solution(DoubleTab& pression) +int Assembleur_P_VDF::modifier_solution(DoubleTab& tab_pression) { - // Projection : - double press_0; if(!has_P_ref) { // On prend la pression minimale comme pression de reference // afin d'avoir la meme pression de reference en sequentiel et parallele - press_0=DMAXFLOAT; + double press_0; int nb_elem=le_dom_VDF->domaine().nb_elem(); - for(int n=0; n(tab_pression).view_ro(); + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int n, double& press_min) + { + if (pression(n) < press_min) press_min = pression(n); + }, Kokkos::Min(press_0)); + end_gpu_timer(__KERNEL_NAME__); press_0 = mp_min(press_0); - pression -=press_0; - pression.echange_espace_virtuel(); + tab_pression -= press_0; + tab_pression.echange_espace_virtuel(); } return 1; } int Assembleur_P_VDF::assembler_mat(Matrice& matrice,const DoubleVect& volumes_entrelaces,int incr_pression,int resoudre_en_u) { + statistics().begin_count(STD_COUNTERS::matrix_assembly,statistics().get_last_opened_counter_level()+1); if (!matrice) { if (je_suis_maitre()) @@ -660,6 +674,7 @@ int Assembleur_P_VDF::assembler_mat(Matrice& matrice,const DoubleVect& volumes_e set_resoudre_en_u(resoudre_en_u); remplir(matrice,volumes_entrelaces, 0); + statistics().end_count(STD_COUNTERS::matrix_assembly); return 1; } @@ -677,7 +692,6 @@ int Assembleur_P_VDF::assembler(Matrice& matrice) set_resoudre_en_u(1); construire(matrice); const Domaine_VDF& domaine_vdf = le_dom_VDF.valeur(); - const DoubleVect& volumes_entrelaces = domaine_vdf.volumes_entrelaces(); remplir(matrice,volumes_entrelaces, 0); return 1; diff --git a/src/VDF/Solveurs/Assembleur_P_VDF.h b/src/VDF/Solveurs/Assembleur_P_VDF.h index 314dc27192..aa4e5f7886 
100644 --- a/src/VDF/Solveurs/Assembleur_P_VDF.h +++ b/src/VDF/Solveurs/Assembleur_P_VDF.h @@ -45,11 +45,13 @@ class Assembleur_P_VDF: public Assembleur_base void assembler_continuite(matrices_t matrices, DoubleTab& secmem, int aux_only = 0) const override; DoubleTab norme_continuite() const override; + protected_but_public_for_cuda + void modifier_secmem_vitesse_imposee(const Entree_fluide_vitesse_imposee& cond_lim, const Front_VF& frontiere_vf, DoubleTab& secmem); + protected: int construire(Matrice& la_matrice); int remplir(Matrice& la_matrice, const DoubleVect& volumes_entrelaces, const Champ_Don_base *rho_ptr); void modifier_secmem_pression_imposee(const Neumann_sortie_libre& cond_lim, const Front_VF& frontiere_vf, DoubleTab& secmem); - void modifier_secmem_vitesse_imposee(const Entree_fluide_vitesse_imposee& cond_lim, const Front_VF& frontiere_vf, DoubleTab& secmem); int liste_faces_periodiques(ArrOfInt& faces); OBS_PTR(Domaine_VDF) le_dom_VDF; diff --git a/src/VDF/Solveurs/Assembleur_P_VDF_Q4.cpp b/src/VDF/Solveurs/Assembleur_P_VDF_Q4.cpp index a0451650aa..ef9391bab8 100644 --- a/src/VDF/Solveurs/Assembleur_P_VDF_Q4.cpp +++ b/src/VDF/Solveurs/Assembleur_P_VDF_Q4.cpp @@ -58,6 +58,7 @@ int Assembleur_P_VDF_Q4::assembler(Matrice& la_matrice) tab2 = 0; coeff = 0; rang_voisins = 1; + ToDo_Kokkos("critical"); for (face=0 ; face(tab_sm).view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA( const int num_elem) + { + sm(num_elem) /= (volumes(num_elem) * porosite_elem(num_elem)); + }); + } else if (nb_dim == 2) { - for (int num_elem = 0; num_elem < nb_elem; num_elem++) + DoubleTabView sm = tab_sm.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int num_elem) + { for (int k = 0; k < nb_comp; k++) sm(num_elem, k) /= (volumes(num_elem) * porosite_elem(num_elem)); + }); } - else if (sm.nb_dim() == 3) + else if (nb_dim == 3) { //int d0=sm.dimension(0); - int d1 = 
sm.dimension(1); - int d2 = sm.dimension(2); - for (int num_elem = 0; num_elem < nb_elem; num_elem++) + int d1 = tab_sm.dimension(1); + int d2 = tab_sm.dimension(2); + DoubleTabView3 sm = tab_sm.view_rw<3>(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int num_elem) + { for (int k = 0; k < d1; k++) for (int d = 0; d < d2; d++) sm(num_elem, k, d) /= (volumes(num_elem) * porosite_elem(num_elem)); + }); } else { - Cerr << "Masse_VDF_Elem::appliquer ne peut pas s'appliquer a un DoubleTab a " << sm.nb_dim() << " dimensions" << finl; + Cerr << "Masse_VDF_Elem::appliquer ne peut pas s'appliquer a un DoubleTab a " << nb_dim << " dimensions" << finl; Process::exit(); } - sm.echange_espace_virtuel(); - return sm; + end_gpu_timer(__KERNEL_NAME__); + tab_sm.echange_espace_virtuel(); + return tab_sm; } } diff --git a/src/VDF/Solveurs/Masse_VDF_Face.cpp b/src/VDF/Solveurs/Masse_VDF_Face.cpp index 7b15583f02..215daf86bf 100644 --- a/src/VDF/Solveurs/Masse_VDF_Face.cpp +++ b/src/VDF/Solveurs/Masse_VDF_Face.cpp @@ -38,66 +38,55 @@ void Masse_VDF_Face::completer() Solveur_Masse_Face_proto::associer_masse_proto(*this,le_dom_VDF.valeur()); } -DoubleTab& Masse_VDF_Face::appliquer_impl(DoubleTab& sm) const +DoubleTab& Masse_VDF_Face::appliquer_impl(DoubleTab& tab_sm) const { - if (sub_type(Pb_Multiphase, equation().probleme())) return Solveur_Masse_Face_proto::appliquer_impl_proto(sm); + if (sub_type(Pb_Multiphase, equation().probleme())) return Solveur_Masse_Face_proto::appliquer_impl_proto(tab_sm); else { - assert(le_dom_VDF); assert(le_dom_Cl_VDF); const Domaine_VDF& domaine_VDF = le_dom_VDF.valeur(); - const DoubleVect& porosite_face = equation().milieu().porosite_face(); - const DoubleVect& volumes_entrelaces = domaine_VDF.volumes_entrelaces(); - const int nb_faces = domaine_VDF.nb_faces(), N = sm.line_size(); - - if (sm.dimension(0) != nb_faces) - { - Cerr << "Masse_VDF_Face::appliquer : erreur dans la taille de sm" << finl; - 
Process::exit(); - } + const int nb_faces = domaine_VDF.nb_faces(), N = tab_sm.line_size(); - // Boucle sur les faces joint + if (tab_sm.dimension(0) != nb_faces) Process::exit("Masse_VDF_Face::appliquer : erreur dans la taille de tab_sm"); // Boucle sur les bords // Sur les faces qui portent des conditions aux limites de Dirichlet ou de Symetrie // la vitesse normale reste egale a sa valeur initiale. // Donc sur ces faces vpoint doit rester a 0. - + CDoubleArrView porosite_face = equation().milieu().porosite_face().view_ro(); + CDoubleArrView volumes_entrelaces = domaine_VDF.volumes_entrelaces().view_ro(); + DoubleTabView sm = tab_sm.view_rw(); for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++) { - // pour chaque Condition Limite on regarde son type const Cond_lim& la_cl = le_dom_Cl_VDF->les_conditions_limites(n_bord); const Front_VF& la_front_dis = ref_cast(Front_VF, la_cl->frontiere_dis()); const int ndeb = la_front_dis.num_premiere_face(); const int nfin = ndeb + la_front_dis.nb_faces(); - - if ( sub_type(Dirichlet,la_cl.valeur()) || sub_type(Dirichlet_homogene, la_cl.valeur())) - // Pour les faces de Dirichlet on met sm a 0 - for (int f = ndeb; f < nfin; f++) - for (int n = 0; n < N; n++) - sm(f, n) = 0; - else if (sub_type(Symetrie, la_cl.valeur())) - // Pour les faces de Symetrie on met vpoint a 0 - for (int f = ndeb; f < nfin; f++) - for (int n = 0; n < N; n++) - sm(f, n) = 0; - else - for (int f = ndeb; f < nfin; f++) - for (int n = 0; n < N; n++) - sm(f, n) /= (volumes_entrelaces(f) * porosite_face(f)); - + bool null = sub_type(Dirichlet,la_cl.valeur()) || sub_type(Dirichlet_homogene, la_cl.valeur()) || sub_type(Symetrie, la_cl.valeur()); + // Pour les faces de Dirichlet ou Symertie on met sm a 0 + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int f) + { + for (int n = 0; n < N; n++) + { + if (null) sm(f, n) = 0; + else sm(f, n) /= (volumes_entrelaces(f) * porosite_face(f)); + } + }); + 
end_gpu_timer(__KERNEL_NAME__); } // Boucle sur les faces internes const int ndeb = domaine_VDF.premiere_face_int(); - for (int f = ndeb; f < nb_faces; f++) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nb_faces), KOKKOS_LAMBDA(const int f) + { for (int n = 0; n < N; n++) sm(f, n) /= (volumes_entrelaces(f) * porosite_face(f)); + }); + end_gpu_timer(__KERNEL_NAME__); //sm.echange_espace_virtuel(); - //Debog::verifier("Masse_VDF_Face::appliquer sm",sm); - return sm; + return tab_sm; } } @@ -133,6 +122,7 @@ void Masse_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, doubl /* faces : si CLs, pas de produit par alpha * rho en multiphase */ DoubleTrav masse(N, N), masse_e(N, N); //masse alpha * rho, contribution + ToDo_Kokkos("critical"); for (f = 0; f < domaine.nb_faces(); f++) //faces reelles { if (!pbm || fcl(f, 0) >= 2) @@ -176,6 +166,7 @@ DoubleTab& Masse_VDF_Face::corriger_solution(DoubleTab& x, const DoubleTab& y, i const DoubleVect& fs = domaine.face_surfaces(); int f, n, N = x.line_size(), d, D = dimension; + ToDo_Kokkos("critical"); for (f = 0; f < domaine.nb_faces_tot(); f++) if (fcl(f, 0) == 2 || fcl(f, 0) == 4) for (n = 0; n < N; n++) x(f, n) = incr ? 
-vit(f, n) : 0; //Dirichlet homogene / Symetrie: on revient a 0 diff --git a/src/VDF/Solveurs/Rayo_semi_transp_solver_VDF.cpp b/src/VDF/Solveurs/Rayo_semi_transp_solver_VDF.cpp index b7b47c048a..a5ef967c58 100644 --- a/src/VDF/Solveurs/Rayo_semi_transp_solver_VDF.cpp +++ b/src/VDF/Solveurs/Rayo_semi_transp_solver_VDF.cpp @@ -86,6 +86,7 @@ void Rayo_semi_transp_solver_VDF::modifier_matrice() assert(fluide.longueur_rayo().nb_comp() == 1); assert(fluide.kappa().nb_comp() == 1); + ToDo_Kokkos("critical"); for (int face = ndeb; face < nfin; face++) { int elem = face_voisins(face, 0); @@ -222,6 +223,7 @@ void Rayo_semi_transp_solver_VDF::resoudre(double temps) assert(fluide.kappa().nb_comp() == 1); double n = -123., k = -123.; + ToDo_Kokkos("critical"); for (int elem = 0; elem < nb_elem; elem++) { if (sub_type(Champ_Uniforme, fluide.indice())) diff --git a/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Elem.cpp b/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Elem.cpp index ee67c4c7f3..1d763126ca 100644 --- a/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Elem.cpp +++ b/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Elem.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -169,6 +169,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::init() SFichier fic_verif("Tverif.RANS"); EFichier fic_vit("temperature_RANS.dat"); + ToDo_Kokkos("critical"); for(int num_elem=0 ; num_elem> trash; Cerr << "trash = " << trash << finl; + ToDo_Kokkos("critical"); for(int num_elem = 0 ; num_elem> utemp_sum(num_elem); @@ -266,6 +269,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::mettre_a_jour(double temps) { if (cptbis==0) { + ToDo_Kokkos("critical"); for (int num_elem=0; num_elem=f_start) { + ToDo_Kokkos("critical"); for (int num_elem=0; num_elemf_start)&&(compteur_reprise > 1))||((moyenne==3)&&(tps>dt_min))) { + ToDo_Kokkos("critical"); for(int num_elem = 0 ; num_elem> trash; Cerr << "trash = " << trash << finl; + ToDo_Kokkos("critical"); for(int num_face = 0 ; num_face> utemp_sum(num_face); @@ -330,6 +333,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::init_calcul_moyenne_spat() // remplissage des tableaux ci-dessus // Pour le calcul de u, v, w sur les plans d hmogeneite + ToDo_Kokkos("critical"); for (num_face=0; num_face 1.1)&&(xv(face,1) < 0.9)&&(orientation(face)==0)) @@ -1086,6 +1104,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::ajouter_blocs(matrices_t matrices, Do if((tps>dt_min)&&(tps>f_start)) { + ToDo_Kokkos("critical"); for(int num_face = 0 ; num_faceorientation(); - const DoubleVect& porosite_surf = equation().milieu().porosite_face(); - const DoubleVect& volumes_entrelaces = domaine_VF.volumes_entrelaces(); - int ncomp; - ArrOfDouble s(source()); + const IntVect& tab_orientation = le_dom_VDF->orientation(); + const DoubleVect& tab_porosite_surf = equation().milieu().porosite_face(); + const DoubleVect& tab_volumes_entrelaces = domaine_VF.volumes_entrelaces(); + ArrOfDouble tab_s(source()); + + CDoubleArrView volumes_entrelaces = tab_volumes_entrelaces.view_ro(); + CDoubleArrView porosite_surf 
= tab_porosite_surf.view_ro(); + CIntArrView orientation = tab_orientation.view_ro(); + DoubleTabView secmen = tab_secmen.view_rw(); + + CDoubleArrView s = tab_s.view_ro(); // Boucle sur les conditions limites pour traiter les faces de bord int n_bord, ndeb, nfin; for (n_bord = 0; n_bord < domaine_VF.nb_front_Cl(); n_bord++) @@ -69,12 +75,14 @@ void Terme_Source_Canal_perio_VDF_Face::ajouter_blocs(matrices_t matrices, Doubl ndeb = le_bord.num_premiere_face(); nfin = ndeb + le_bord.nb_faces(); - for (int num_face = ndeb; num_face < nfin; num_face++) - { - double vol = volumes_entrelaces(num_face) * porosite_surf(num_face); - ncomp = orientation(num_face); - secmem(num_face) += s[ncomp] * vol; - } + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin),KOKKOS_LAMBDA(const int num_face) + { + double vol = volumes_entrelaces(num_face) * porosite_surf(num_face); + int ncomp = orientation(num_face); + + secmen(num_face,0) += s[ncomp] * vol; + }); + end_gpu_timer(__KERNEL_NAME__); } else if ((sub_type(Dirichlet, la_cl.valeur())) || (sub_type(Dirichlet_homogene, la_cl.valeur()))) { @@ -85,22 +93,20 @@ void Terme_Source_Canal_perio_VDF_Face::ajouter_blocs(matrices_t matrices, Doubl // Boucle sur les faces internes ndeb = domaine_VF.premiere_face_int(); int nb_faces = domaine_VF.nb_faces(); - for (int num_face = ndeb; num_face < nb_faces; num_face++) - { - double vol = volumes_entrelaces(num_face) * porosite_surf(num_face); - ncomp = orientation(num_face); - secmem(num_face) += s[ncomp] * vol; - } - + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nb_faces),KOKKOS_LAMBDA(const int num_face) + { + double vol = volumes_entrelaces(num_face) * porosite_surf(num_face); + int ncomp = orientation(num_face); + + secmen(num_face,0) += s[ncomp] * vol; + }); + end_gpu_timer(__KERNEL_NAME__); } void Terme_Source_Canal_perio_VDF_Face::calculer_debit(double& debit_e) const { const Domaine_VF& domaine_VF = 
le_dom_VDF.valeur(); const Domaine_Cl_dis_base& domaine_Cl_dis = le_dom_Cl_VDF.valeur(); - const DoubleTab& vitesse = equation().inconnue().valeurs(); - const DoubleVect& porosite_surf = equation().milieu().porosite_face(); - int ndeb, nfin, num_face; int nb_bords = domaine_VF.nb_front_Cl(); for (int n_bord = 0; n_bord < nb_bords; n_bord++) { @@ -115,31 +121,31 @@ void Terme_Source_Canal_perio_VDF_Face::calculer_debit(double& debit_e) const int axe = perio.direction_periodicite(); assert(axe == direction_ecoulement_); debit_e = 0.; - ndeb = le_bord.num_premiere_face(); - nfin = ndeb + le_bord.nb_faces() / 2; + int ndeb = le_bord.num_premiere_face(); + int nfin = ndeb + le_bord.nb_faces() / 2; + CDoubleArrView porosite = equation().milieu().porosite_face().view_ro(); + CDoubleArrView vitesse = static_cast(equation().inconnue().valeurs()).view_ro(); + CDoubleTabView face_normales = domaine_VF.face_normales().view_ro(); if (equation().probleme().is_dilatable() == 1) { // Si l'on est en Quasi/Weakly Compressible, il faut conserver // le debit massique et non pas le debit volumique. - // C'est pour cela que dans le cas QC/WC, on multiplie les vecteurs vitesse - // par la masse volumique discretisee aux faces pour que lorsqu'on integre sur la surface, - // on obtienne bien un debit massique et non pas un debit volumique. 
const DoubleTab& tab_rho_face = ref_cast(Fluide_Dilatable_base,equation().milieu()).rho_discvit(); - - for (num_face = ndeb; num_face < nfin; num_face++) - { - double debit_face = porosite_surf[num_face] * vitesse[num_face] * std::fabs(domaine_VF.face_normales(num_face, axe)); - debit_e += tab_rho_face[num_face] * debit_face; - } + CDoubleArrView rho_face = static_cast(tab_rho_face).view_ro(); + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face, double& sum) + { + sum += rho_face(num_face) * porosite(num_face) * vitesse(num_face) * Kokkos::fabs(face_normales(num_face, axe)); + }, debit_e); + end_gpu_timer(__KERNEL_NAME__); } else { - for (num_face = ndeb; num_face < nfin; num_face++) - { - double debit_face = porosite_surf[num_face] * vitesse[num_face] * std::fabs(domaine_VF.face_normales(num_face, axe)); - debit_e += debit_face; - } + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face, double& sum) + { + sum += porosite(num_face) * vitesse(num_face) * Kokkos::fabs(face_normales(num_face, axe)); + }, debit_e); + end_gpu_timer(__KERNEL_NAME__); } } } diff --git a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.h b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.h index 63b6351aaf..ed0e7fb120 100644 --- a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.h +++ b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -37,15 +37,16 @@ class Terme_Source_Canal_perio_VDF_Face : public Terme_Source_Canal_perio Declare_instanciable(Terme_Source_Canal_perio_VDF_Face); public : inline void dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl) const override {} - void ajouter_blocs(matrices_t matrices, DoubleTab& secmem, const tabs_t& semi_impl) const override; + void ajouter_blocs(matrices_t matrices, DoubleTab& tab_secmen, const tabs_t& semi_impl) const override; inline int has_interface_blocs() const override { return 1; } + protected_but_public_for_cuda + void calculer_debit(double&) const override; + protected : OBS_PTR(Domaine_VDF) le_dom_VDF; OBS_PTR(Domaine_Cl_VDF) le_dom_Cl_VDF; void associer_domaines(const Domaine_dis_base& ,const Domaine_Cl_dis_base& ) override; - - void calculer_debit(double&) const override; }; class Terme_Source_Canal_perio_QC_VDF_Face : public Terme_Source_Canal_perio_VDF_Face diff --git a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_P0.cpp b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_P0.cpp index b20d5457a7..afe44e1246 100644 --- a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_P0.cpp +++ b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_P0.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -54,6 +54,7 @@ ArrOfDouble Terme_Source_Canal_perio_VDF_P0::source_convection_diffusion(double // Compute source term with // Source = -Sum(imposed_heat_flux)/Volume // Loop on the faces + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < size; num_elem++) s[num_elem] = -heat_flux/volume; } @@ -68,6 +69,7 @@ void Terme_Source_Canal_perio_VDF_P0::ajouter_blocs(matrices_t matrices, DoubleT // Boucle sur les elements internes int nb_elem = domaine_VF.nb_elem(); + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < nb_elem; num_elem++) { double vol = volumes(num_elem); diff --git a/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.cpp b/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.cpp index 94defcf7ab..0f1f9074df 100644 --- a/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.cpp +++ b/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -35,29 +35,34 @@ void Source_Fluide_Dilatable_VDF_Proto::associer_volume_porosite_impl(const Doma porosites.ref(le_dom_Cl->equation().milieu().porosite_elem()); } -void Source_Fluide_Dilatable_VDF_Proto::ajouter_impl(const DoubleVect& g,const double rho_m, - const DoubleTab& tab_rho, DoubleTab& resu) const +void Source_Fluide_Dilatable_VDF_Proto::ajouter_impl(const DoubleVect& tab_g,const double rho_m, + const DoubleTab& tab_rho, DoubleTab& tab_resu) const { - - const int nb_faces = le_dom->nb_faces(), premiere_face_interne = le_dom->premiere_face_int(); - const IntVect& orientation = le_dom->orientation(); - const DoubleVect& volumes_entrelaces = le_dom->volumes_entrelaces(), porosite_surf=le_dom_Cl->equation().milieu().porosite_face(); - + CIntArrView orientation = le_dom->orientation().view_ro(); + CDoubleArrView g = tab_g.view_ro(); + CDoubleArrView volumes_entrelaces = le_dom->volumes_entrelaces().view_ro(); + CDoubleArrView porosite_surf = le_dom_Cl->equation().milieu().porosite_face().view_ro(); + CDoubleArrView rho = static_cast(tab_rho).view_ro(); + DoubleArrView resu = static_cast(tab_resu).view_rw(); for (int num_cl=0 ; num_clnb_front_Cl() ; num_cl++) { const Cond_lim& la_cl = le_dom_Cl->les_conditions_limites(num_cl); const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis()); - const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); - if (sub_type(Dirichlet,la_cl.valeur()) || sub_type(Dirichlet_homogene,la_cl.valeur())) { /* Do nothing */ } else { - for (int face=ndeb ; facenb_faces(), premiere_face_interne = le_dom->premiere_face_int(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(premiere_face_interne, nb_faces), KOKKOS_LAMBDA(const int face) + { + resu(face) += (rho(face) - rho_m) * g(orientation(face)) * volumes_entrelaces(face) * 
porosite_surf(face); + }); + end_gpu_timer(__KERNEL_NAME__); } diff --git a/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.h b/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.h index 4b09cba7b8..029b42a626 100644 --- a/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.h +++ b/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -18,6 +18,7 @@ #include #include +#include class Domaine_Cl_VDF; class Equation_base; @@ -27,10 +28,12 @@ class Domaine_Cl_dis_base; class Source_Fluide_Dilatable_VDF_Proto { + protected_but_public_for_cuda + void ajouter_impl( const DoubleVect& g, const double rho_m, const DoubleTab& tab_rho, DoubleTab& resu) const; + protected: void associer_domaines_impl(const Domaine_dis_base& domaine,const Domaine_Cl_dis_base& domaine_cl); void associer_volume_porosite_impl(const Domaine_dis_base& domaine, DoubleVect& volumes, DoubleVect& porosites); - void ajouter_impl( const DoubleVect& g, const double rho_m, const DoubleTab& tab_rho, DoubleTab& resu) const; OBS_PTR(Domaine_Cl_VDF) le_dom_Cl; OBS_PTR(Domaine_VDF) le_dom; diff --git a/src/VDF/Sources/Dilatable/Source_Masse_Fluide_Dilatable_VDF.cpp b/src/VDF/Sources/Dilatable/Source_Masse_Fluide_Dilatable_VDF.cpp index c923df06d6..6b917f25dc 100644 --- a/src/VDF/Sources/Dilatable/Source_Masse_Fluide_Dilatable_VDF.cpp +++ b/src/VDF/Sources/Dilatable/Source_Masse_Fluide_Dilatable_VDF.cpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include Implemente_instanciable(Source_Masse_Fluide_Dilatable_VDF,"Source_Masse_Fluide_Dilatable_VDF",Source_Masse_Fluide_Dilatable_base); @@ -77,21 +79,20 @@ Entree& 
Source_Masse_Fluide_Dilatable_VDF::readOn(Entree& is) { return Source_Ma * Y, rho at cell center, same as before for surf and V... This gives well the unit 1 / s, as d(Y)/dt ! * */ -void Source_Masse_Fluide_Dilatable_VDF::ajouter_eq_espece(const Convection_Diffusion_Fluide_Dilatable_base& eqn, const Fluide_Dilatable_base& fluide, const bool is_expl, DoubleVect& resu) const +void Source_Masse_Fluide_Dilatable_VDF::ajouter_eq_espece(const Convection_Diffusion_Fluide_Dilatable_base& eqn, const Fluide_Dilatable_base& fluide, const bool is_expl, DoubleVect& tab_resu) const { assert(sub_type(Fluide_Weakly_Compressible,fluide)); - const DoubleTab& Y = eqn.inconnue().valeurs(), &rho = fluide.masse_volumique().valeurs(); + const DoubleTab& tab_Y = eqn.inconnue().valeurs(), &tab_rho = fluide.masse_volumique().valeurs(); const Domaine_Cl_dis_base& zclb = domaine_cl_dis_.valeur(); const Domaine_VF& zvf = ref_cast(Domaine_VF, zclb.domaine_dis()); - const IntTab& face_voisins = zvf.face_voisins(); // pour post Champ_Don_base * post_src_ch = fluide.has_source_masse_espece_champ() ? &ref_cast_non_const(Fluide_Dilatable_base, fluide).source_masse_espece() : nullptr; - // On commence par remplir val_flux seulement pour les bonnes faces ... - DoubleTrav val_flux(zvf.nb_faces(), 1); - fill_val_flux_tab(val_flux); + // On commence par remplir flux seulement pour les bonnes faces ... + DoubleTrav flux(zvf.nb_faces(), 1); + fill_val_flux_tab(flux); // Maintennat on regarde resu ... for (int n_bord = 0; n_bord < domaine_cl_dis_->nb_cond_lim(); n_bord++) @@ -102,19 +103,29 @@ void Source_Masse_Fluide_Dilatable_VDF::ajouter_eq_espece(const Convection_Diffu if (le_bord.le_nom() == nom_bord_) { const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); - for (int num_face = ndeb; num_face < nfin; num_face++) - { - const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); - int elem = elem1 == -1 ? 
elem2 : elem1; - const double surface_elem = zvf.face_surfaces(num_face); - double srcmass = -(Y(elem) * val_flux(num_face, 0) * surface_elem) / rho(elem); - if (is_expl) - srcmass /= zvf.volumes(elem); // on divise par volume (pas de solveur masse dans l'equation ...) - resu(elem) += srcmass; - - if (post_src_ch) - (*post_src_ch).valeurs()(elem) = srcmass; - } + + CIntTabView face_voisins = zvf.face_voisins().view_ro(); + CDoubleArrView face_surfaces = zvf.face_surfaces().view_ro(); + CDoubleTabView val_flux = flux.view_ro(); + CDoubleArrView volumes = zvf.volumes().view_ro(); + CDoubleArrView Y = static_cast(tab_Y).view_ro(); + CDoubleArrView rho = static_cast(tab_rho).view_ro(); + DoubleArrView resu = static_cast(tab_resu).view_rw(); + DoubleArrView post_valeurs = post_src_ch ? static_cast((*post_src_ch).valeurs()).view_wo() : DoubleArrView(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); + int elem = elem1 == -1 ? 
elem2 : elem1; + const double surface_elem = face_surfaces(num_face); + double srcmass = -(Y(elem) * val_flux(num_face, 0) * surface_elem) / rho(elem); + if (is_expl) + srcmass /= volumes(elem); + Kokkos::atomic_add(&resu(elem), srcmass); + + if (post_src_ch) + Kokkos::atomic_store(&post_valeurs(elem), srcmass); + }); + end_gpu_timer(__KERNEL_NAME__); } } @@ -123,19 +134,18 @@ void Source_Masse_Fluide_Dilatable_VDF::ajouter_eq_espece(const Convection_Diffu (*post_src_ch).mettre_a_jour(fluide.inco_chaleur().temps()); } -void Source_Masse_Fluide_Dilatable_VDF::ajouter_projection(const Fluide_Dilatable_base& fluide, DoubleVect& resu) const +void Source_Masse_Fluide_Dilatable_VDF::ajouter_projection(const Fluide_Dilatable_base& fluide, DoubleVect& tab_resu) const { assert(sub_type(Fluide_Weakly_Compressible,fluide)); const Domaine_Cl_dis_base& zclb = domaine_cl_dis_.valeur(); const Domaine_VF& zvf = ref_cast(Domaine_VF, zclb.domaine_dis()); - const IntTab& face_voisins = zvf.face_voisins(); // pour post Champ_Don_base* post_src_ch = fluide.has_source_masse_projection_champ() ? &ref_cast_non_const(Fluide_Dilatable_base, fluide).source_masse_projection() : nullptr; - // On commence par remplir val_flux seulement pour les bonnes faces ... - DoubleTrav val_flux(zvf.nb_faces(), 1); - fill_val_flux_tab(val_flux); + // On commence par remplir flux seulement pour les bonnes faces ... + DoubleTrav flux(zvf.nb_faces(), 1); + fill_val_flux_tab(flux); // Maintennat on regarde resu ... for (int n_bord = 0; n_bord < domaine_cl_dis_->nb_cond_lim(); n_bord++) @@ -147,17 +157,24 @@ void Source_Masse_Fluide_Dilatable_VDF::ajouter_projection(const Fluide_Dilatabl { const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); - for (int num_face = ndeb; num_face < nfin; num_face++) - { - const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); - int elem = elem1 == -1 ? 
elem2 : elem1; - const double surf = zvf.face_surfaces(num_face); - const double source_per_dv = val_flux(num_face, 0) * surf / zvf.volumes(elem); // TODO multiple elements!! units [kg.s-1] / zvf.volumes(elem) - resu(elem) -= source_per_dv; // in [kg.m-3.s-1] - - if (post_src_ch) - (*post_src_ch).valeurs()(elem) = source_per_dv; - } + CIntTabView face_voisins = zvf.face_voisins().view_ro(); + CDoubleArrView face_surfaces = zvf.face_surfaces().view_ro(); + CDoubleTabView val_flux = flux.view_ro(); + CDoubleArrView volumes = zvf.volumes().view_ro(); + DoubleArrView resu = static_cast(tab_resu).view_rw(); + DoubleArrView post_valeurs = post_src_ch ? static_cast((*post_src_ch).valeurs()).view_rw() : DoubleArrView(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); + int elem = elem1 == -1 ? elem2 : elem1; + const double surf = face_surfaces(num_face); + const double source_per_dv = val_flux(num_face, 0) * surf / volumes(elem); + Kokkos::atomic_add(&resu(elem), -source_per_dv); + + if (post_src_ch) + Kokkos::atomic_store(&post_valeurs(elem), source_per_dv); + }); + end_gpu_timer(__KERNEL_NAME__); } } diff --git a/src/VDF/Sources/Dilatable/Source_WC_Chaleur_VDF.cpp b/src/VDF/Sources/Dilatable/Source_WC_Chaleur_VDF.cpp index de0654fcf0..d33457b71f 100644 --- a/src/VDF/Sources/Dilatable/Source_WC_Chaleur_VDF.cpp +++ b/src/VDF/Sources/Dilatable/Source_WC_Chaleur_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -133,6 +133,7 @@ void Source_WC_Chaleur_VDF::compute_interpolate_gradP_old(DoubleTab& UgradP_elem const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis()); const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces(); + ToDo_Kokkos("critical"); for (int num_face=ndeb; num_facecontient(point,elem) ; diff --git a/src/VDF/Sources/Terme_Boussinesq_VDF_Face.cpp b/src/VDF/Sources/Terme_Boussinesq_VDF_Face.cpp index caa7a6c69c..12c74bf5b4 100644 --- a/src/VDF/Sources/Terme_Boussinesq_VDF_Face.cpp +++ b/src/VDF/Sources/Terme_Boussinesq_VDF_Face.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2025, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -46,26 +46,27 @@ void Terme_Boussinesq_VDF_Face::associer_domaines(const Domaine_dis_base& domain le_dom_Cl_VDF = ref_cast(Domaine_Cl_VDF, domaine_Cl_dis); } -void Terme_Boussinesq_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, const tabs_t& semi_impl) const +void Terme_Boussinesq_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& tab_secmem, const tabs_t& semi_impl) const { const Domaine_VDF& domaine_VDF = le_dom_VDF.valeur(); const Domaine_Cl_VDF& domaine_Cl_VDF_hyd = le_dom_Cl_VDF.valeur(); const Domaine_Cl_dis_base& domaine_Cl_scal = equation_scalaire().domaine_Cl_dis(); const Domaine_Cl_VDF& domaine_Cl_VDF_scal = ref_cast(Domaine_Cl_VDF,domaine_Cl_scal); - const DoubleTab& param = equation_scalaire().inconnue().valeurs(); - const DoubleTab& beta_valeurs = beta().valeurs(); - const DoubleVect& grav = gravite().valeurs(); - const IntTab& face_voisins = domaine_VDF.face_voisins(); - const IntVect& orientation = 
domaine_VDF.orientation(); - const DoubleTab& xv = domaine_VDF.xv(); - const DoubleVect& porosite_surf = equation().milieu().porosite_face(); - const DoubleVect& volumes_entrelaces = domaine_VDF.volumes_entrelaces(); - const DoubleTab& vitesse = equation().inconnue().valeurs(); + const DoubleTab& tab_param = equation_scalaire().inconnue().valeurs(); + CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro(); + CIntArrView orientation = domaine_VDF.orientation().view_ro(); + CDoubleTabView xv = domaine_VDF.xv().view_ro(); + CDoubleArrView porosite_surf = equation().milieu().porosite_face().view_ro(); + CDoubleArrView volumes_entrelaces = domaine_VDF.volumes_entrelaces().view_ro(); + CDoubleArrView vitesse = static_cast(equation().inconnue().valeurs()).view_ro(); + CDoubleTabView param = tab_param.view_ro(); + CDoubleArrView grav = static_cast(gravite().valeurs()).view_ro(); + CDoubleTabView beta_valeurs = beta().valeurs().view_ro(); + CDoubleArrView S0 = getScalaire0().view_ro(); + DoubleArrView secmem = static_cast(tab_secmem).view_rw(); - DoubleVect g(dimension); - g = grav; - - int nb_dim = param.line_size(); + const bool is_axi = domaine_VDF.axi; + int nb_comp = tab_param.line_size(); // Verifie la validite de T0: check(); @@ -85,119 +86,135 @@ void Terme_Boussinesq_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& se if (sub_type(Neumann_sortie_libre,la_cl_scal.valeur())) { const Neumann_sortie_libre& la_cl_neumann_scal = ref_cast(Neumann_sortie_libre, la_cl_scal.valeur()); - for (int num_face=ndeb; num_face0?1:0); - double coef=0; - for (int dim=0; dim0?1:0); + double coef=0; + for (int comp=0; comp0?1:0); + CDoubleTabView val_imp = la_cl_diri_scal.tab_val_imp().view_ro(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + int outlet; + int elem = face_voisins(num_face,0); + if (elem==-1) + { + outlet = (vitesse(num_face)<0?1:0); + elem = face_voisins(num_face,1); + } + else + 
outlet = (vitesse(num_face)>0?1:0); - double coef=0; - for (int dim=0; dimvaleurs()) : 0; + ToDo_Kokkos("critical"); for (num_face=premiere_face; num_facefrontiere_dis()); int nb_faces_bord=le_bord.nb_faces(); ArrOfInt fait(nb_faces_bord); - fait = 0; for (int ind_face=0; ind_faceview_ro(); + rho = ptr_rho->view_ro(); + } if (sub_type(Champ_Uniforme, la_source.valeur())) { - const DoubleVect& s = la_source->valeurs(); + const DoubleVect& tab_s = la_source->valeurs(); + CDoubleArrView s = tab_s.view_ro(); - // Boucle sur les conditions limites pour traiter les faces de bord : pour chaque Condition Limite on regarde son type - // Si face de Dirichlet ou de Symetrie on ne fait rien - // Si face de Neumann on calcule la contribution au terme source + // Boucle sur les conditions limites pour traiter les faces de bord for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++) { const Cond_lim& la_cl = domaine_Cl_VDF.les_conditions_limites(n_bord); - + const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); + const int ndeb = le_bord.num_premiere_face(); + const int nfin = ndeb + le_bord.nb_faces(); if (sub_type(Periodique, la_cl.valeur())) { - if (alp) Process::exit("Terme_Source_Qdm_VDF_Face : periodic CL not yet available for Pb_Multiphase !"); - - const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); - ndeb = le_bord.num_premiere_face(); - nfin = ndeb + le_bord.nb_faces(); - - for (int k = 0; k < nb_comp; k++) - for (num_face = ndeb; num_face < nfin; num_face++) - { - vol = volumes_entrelaces(num_face); - ncomp = orientation(num_face); - resu(num_face, k) += s(nb_comp * ncomp + k) * vol; - } + if (ptr_alp) Process::exit("Terme_Source_Qdm_VDF_Face : periodic CL not yet available for Pb_Multiphase !"); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + const double vol = volumes_entrelaces(num_face); + const int ncomp = orientation(num_face); + for (int k = 0; k < 
nb_comp; k++) + resu(num_face, k) += s(nb_comp * ncomp + k) * vol; + }); + end_gpu_timer(__KERNEL_NAME__); } else if (sub_type(Neumann_sortie_libre, la_cl.valeur())) { - const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); - ndeb = le_bord.num_premiere_face(); - nfin = ndeb + le_bord.nb_faces(); - - for (int k = 0; k < nb_comp; k++) - for (num_face = ndeb; num_face < nfin; num_face++) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face); + const int ncomp = orientation(num_face); + for (int k = 0; k < nb_comp; k++) { - vol = volumes_entrelaces(num_face) * porosite_surf(num_face); - ncomp = orientation(num_face); double alpha_rho = 1.0; - if (alp) + if (has_alp) { - elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); - const int e = ( elem1 > -1 ? elem1 : elem2); - double a = (*alp)(e, k), r = (*rho)(!cR * e, k); - alpha_rho = a * r; + const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); + const int e = (elem1 > -1 ? 
elem1 : elem2); + alpha_rho = alp(e, k) * rho(!cR * e, k); } resu(num_face, k) += s(nb_comp * ncomp + k) * vol * alpha_rho; } - + }); + end_gpu_timer(__KERNEL_NAME__); } else if (sub_type(Symetrie, la_cl.valeur())) { /* Do nothing */} else if ((sub_type(Dirichlet, la_cl.valeur())) || (sub_type(Dirichlet_homogene, la_cl.valeur()))) { /* Do nothing */} } // Boucle sur les faces internes - ndeb = domaine_VDF.premiere_face_int(); - for (int k = 0; k < nb_comp; k++) - for (num_face = domaine_VDF.premiere_face_int(); num_face < domaine_VDF.nb_faces(); num_face++) + const int ndeb_int = domaine_VDF.premiere_face_int(); + const int nfin_int = domaine_VDF.nb_faces(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb_int, nfin_int), KOKKOS_LAMBDA(const int num_face) + { + const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face); + const int ncomp = orientation(num_face); + const int elem1 = face_voisins(num_face, 0); + const int elem2 = face_voisins(num_face, 1); + for (int k = 0; k < nb_comp; k++) { - vol = volumes_entrelaces(num_face) * porosite_surf(num_face); - ncomp = orientation(num_face); double alpha_rho = 1.0; - if (alp) + if (has_alp) { - elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); - double a = 0.5 * ((*alp)(elem1, k) + (*alp)(elem2, k)), r = 0.5 * ((*rho)(!cR * elem1, k) + (*rho)(!cR * elem2, k)); + double a = 0.5 * (alp(elem1, k) + alp(elem2, k)); + double r = 0.5 * (rho(!cR * elem1, k) + rho(!cR * elem2, k)); alpha_rho = a * r; } resu(num_face, k) += s(nb_comp * ncomp + k) * vol * alpha_rho; } + }); + end_gpu_timer(__KERNEL_NAME__); } else // le champ source n'est plus uniforme { @@ -147,83 +151,83 @@ void Terme_Source_Qdm_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& re if (la_source->que_suis_je().contient("_som_")) { // Need to interpolate - const int N = resu.dimension(1), D = dimension; - eval.resize(resu.dimension(0), N * D); + const int N = tab_resu.dimension(1), D = dimension; + 
eval.resize(tab_resu.dimension(0), N * D); la_source->valeur_aux(domaine_VDF.xp(), eval); s_tmp = &eval; } else s_tmp = &(la_source->valeurs()); - const DoubleTab& s = *s_tmp; + const DoubleTab& tab_s = *s_tmp; + CDoubleTabView s = tab_s.view_ro(); - // Boucle sur les conditions limites pour traiter les faces de bord : pour chaque Condition Limite on regarde son type - // Si face de Dirichlet ou de Symetrie on ne fait rien - // Si face de Neumann on calcule la contribution au terme source + // Boucle sur les conditions limites for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++) { const Cond_lim& la_cl = domaine_Cl_VDF.les_conditions_limites(n_bord); - + const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); + const int ndeb = le_bord.num_premiere_face(); + const int nfin = ndeb + le_bord.nb_faces(); if (sub_type(Neumann_sortie_libre, la_cl.valeur())) { - const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); - ndeb = le_bord.num_premiere_face(); - nfin = ndeb + le_bord.nb_faces(); - - for (int k = 0; k < nb_comp; k++) - for (num_face = ndeb; num_face < nfin; num_face++) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face); + const int ncomp = orientation(num_face); + const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); + const int e = (elem1 > -1 ? elem1 : elem2); + for (int k = 0; k < nb_comp; k++) { - vol = volumes_entrelaces(num_face) * porosite_surf(num_face); - ncomp = orientation(num_face); - elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); - const int e = (elem1 > -1 ? 
elem1 : elem2); double alpha_rho = 1.0; - if (alp) - { - double a = (*alp)(e, k), r = (*rho)(!cR * e, k); - alpha_rho = a * r; - } + if (has_alp) + alpha_rho = alp(e, k) * rho(!cR * e, k); resu(num_face, k) += s(e, nb_comp * ncomp + k) * vol * alpha_rho; } + }); + end_gpu_timer(__KERNEL_NAME__); } else if (sub_type(Symetrie, la_cl.valeur())) { /* Do nothing */} else if ((sub_type(Dirichlet, la_cl.valeur())) || (sub_type(Dirichlet_homogene, la_cl.valeur()))) { /* Do nothing */} else if (sub_type(Periodique, la_cl.valeur())) { - if (alp) Process::exit("Terme_Source_Qdm_VDF_Face : periodic CL not yet available for Pb_Multiphase !"); - - const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); - ndeb = le_bord.num_premiere_face(); - nfin = ndeb + le_bord.nb_faces(); - - for (int k = 0; k < nb_comp; k++) - for (num_face = ndeb; num_face < nfin; num_face++) + if (ptr_alp) Process::exit("Terme_Source_Qdm_VDF_Face : periodic CL not yet available for Pb_Multiphase !"); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face); + const int ncomp = orientation(num_face); + for (int k = 0; k < nb_comp; k++) { - vol = volumes_entrelaces(num_face) * porosite_surf(num_face); - ncomp = orientation(num_face); - double s_face = 0.5 * (s(face_voisins(num_face, 0), nb_comp * ncomp + k) + s(face_voisins(num_face, 1), nb_comp * ncomp + k)); + const double s_face = 0.5 * (s(face_voisins(num_face, 0), nb_comp * ncomp + k) + s(face_voisins(num_face, 1), nb_comp * ncomp + k)); resu(num_face, k) += s_face * vol; } + }); + end_gpu_timer(__KERNEL_NAME__); } } // Boucle sur les faces internes - ndeb = domaine_VDF.premiere_face_int(); - - for (int k = 0; k < nb_comp; k++) - for (num_face = domaine_VDF.premiere_face_int(); num_face < domaine_VDF.nb_faces(); num_face++) + const int ndeb_int = domaine_VDF.premiere_face_int(); + const int nfin_int = 
domaine_VDF.nb_faces(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb_int, nfin_int), KOKKOS_LAMBDA(const int num_face) + { + const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face); + const int ncomp = orientation(num_face); + const int elem1 = face_voisins(num_face, 0); + const int elem2 = face_voisins(num_face, 1); + for (int k = 0; k < nb_comp; k++) { - vol = volumes_entrelaces(num_face) * porosite_surf(num_face); - ncomp = orientation(num_face); - elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1); double alpha_rho = 1.0; - if (alp) + if (has_alp) { - double a = 0.5 * ((*alp)(elem1, k) + (*alp)(elem2, k)), r = 0.5 * ((*rho)(!cR * elem1, k) + (*rho)(!cR * elem2, k)); + double a = 0.5 * (alp(elem1, k) + alp(elem2, k)); + double r = 0.5 * (rho(!cR * elem1, k) + rho(!cR * elem2, k)); alpha_rho = a * r; } - double s_face = 0.5 * (s(elem1, nb_comp * ncomp + k) + s(elem2, nb_comp * ncomp + k)); - resu(num_face,k ) += s_face * vol * alpha_rho; + const double s_face = 0.5 * (s(elem1, nb_comp * ncomp + k) + s(elem2, nb_comp * ncomp + k)); + resu(num_face, k) += s_face * vol * alpha_rho; } + }); + end_gpu_timer(__KERNEL_NAME__); } } diff --git a/src/VDF/Sources/Terme_Source_Solide_SWIFT_VDF.cpp b/src/VDF/Sources/Terme_Source_Solide_SWIFT_VDF.cpp index c8512934ef..0c1c086cf2 100644 --- a/src/VDF/Sources/Terme_Source_Solide_SWIFT_VDF.cpp +++ b/src/VDF/Sources/Terme_Source_Solide_SWIFT_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -163,6 +163,7 @@ void Terme_Source_Solide_SWIFT_VDF::init_calcul_moyenne(const Conduction& my_eqn j = 0; indic = 0; + ToDo_Kokkos("critical"); for (num_elem = 0; num_elem < nb_elems; num_elem++) { y = xp(num_elem, 1); @@ -252,6 +253,7 @@ void Terme_Source_Solide_SWIFT_VDF::ajouter_blocs(matrices_t matrices, DoubleTab calcul_moyenne(eq_swift.valeur(), Tmoy_swift, corresp_swift, compt_swift); calcul_moyenne(eq_corse.valeur(), Tmoy_corse, corresp_corse, compt_corse); + ToDo_Kokkos("critical"); for (int num_elem = 0; num_elem < nb_elems; num_elem++) { resu(num_elem) += volume(num_elem) * (Tmoy_corse(corresp_SC[corresp_swift[num_elem]]) - Tmoy_swift(corresp_swift[num_elem])) / tau; diff --git a/src/VDF/Sources/Terme_Source_inc_VDF_Face.cpp b/src/VDF/Sources/Terme_Source_inc_VDF_Face.cpp index fe0862c0c6..c84cbf12d7 100644 --- a/src/VDF/Sources/Terme_Source_inc_VDF_Face.cpp +++ b/src/VDF/Sources/Terme_Source_inc_VDF_Face.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -751,37 +751,43 @@ void Terme_Source_inc_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& se // Evaluate the dynamic model coefficient f int elem; if (dimension == 2) - for (elem=0; elem=0) @@ -517,6 +519,7 @@ void Traitement_particulier_NS_Profils_VDF::init_calcul_moyenne() indicw_m = indicw_p = 0; indicuv_m = indicuv_p = 0; //Boucle sur les faces pour avoir la correspondance pour les composantes de la vitesse. 
+ ToDo_Kokkos("critical"); for (num_face=0; num_face diff --git a/src/VDF/Traitement_particulier/Traitement_particulier_NS_canal_VDF.cpp b/src/VDF/Traitement_particulier/Traitement_particulier_NS_canal_VDF.cpp index b8f845583d..d27c25fee4 100644 --- a/src/VDF/Traitement_particulier/Traitement_particulier_NS_canal_VDF.cpp +++ b/src/VDF/Traitement_particulier/Traitement_particulier_NS_canal_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2024, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -73,6 +73,7 @@ void Traitement_particulier_NS_canal_VDF::remplir_Y(DoubleVect& tab_Y, DoubleVe //Remplissage du tableau Y //////////////////////////////////////////////////////// + ToDo_Kokkos("critical"); for (num_elem=0; num_elem @@ -287,6 +288,7 @@ void Traitement_particulier_Solide_canal_VDF::init_calcul_moyenne() compt = 0; corresp = -1; + ToDo_Kokkos("critical"); for (num_elem=0; num_eleminconnue().temps(); - DoubleTab& visco_turb = la_viscosite_turbulente_->valeurs(); + DoubleTab& tab_visco_turb = la_viscosite_turbulente_->valeurs(); if (est_egal(cw_, 0., 1.e-15)) - visco_turb = 0.; + tab_visco_turb = 0.; else { - const int nb_elem = domaine_VDF.domaine().nb_elem(), nb_elem_tot = domaine_VDF.nb_elem_tot(); - - OP1_.resize(nb_elem_tot); // OP1 est le premier operateur spatial du modele WALE. - OP2_.resize(nb_elem_tot); // OP2 est le deuxieme operateur spatial du modele WALE. - + const int nb_elem_tot = domaine_VDF.nb_elem_tot(); + if (OP1_.size_array()!=nb_elem_tot) + { + OP1_.resize(nb_elem_tot); // OP1 est le premier operateur spatial du modele WALE. + OP2_.resize(nb_elem_tot); // OP2 est le deuxieme operateur spatial du modele WALE. 
+ } calculer_OP1_OP2(); - if (visco_turb.size() != nb_elem) + const int nb_elem = domaine_VDF.domaine().nb_elem(); + if (tab_visco_turb.size() != nb_elem) { Cerr << "Size error for the array containing the values of the turbulent viscosity." << finl; exit(); } - for (int elem = 0; elem < nb_elem; elem++) - { - if (OP1_[elem] != 0.) // donc sd2 (et OP2 par voie de consequence) sont differents de zero - visco_turb[elem] = cw_ * cw_ * l_(elem) * l_(elem) * OP1_[elem] / OP2_[elem]; - else - visco_turb[elem] = 0; - } + const double cw = cw_; + CDoubleArrView l = l_.view_ro(); + CDoubleArrView OP1 = OP1_.view_rw(); + CDoubleArrView OP2 = OP2_.view_rw(); + DoubleArrView visco_turb = static_cast(tab_visco_turb).view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem) + { + visco_turb[elem] = OP2[elem] == 0. ? 0. : cw * cw * l(elem) * l(elem) * OP1[elem] / OP2[elem]; + }); // fin de la boucle sur les elements + end_gpu_timer(__KERNEL_NAME__); } la_viscosite_turbulente_->changer_temps(temps); @@ -87,92 +92,72 @@ void Modele_turbulence_hyd_LES_Wale_VDF::calculer_OP1_OP2() const Domaine_Cl_VDF& domaine_Cl_VDF = ref_cast(Domaine_Cl_VDF, le_dom_Cl_.valeur()); const int nb_elem = domaine_VDF.domaine().nb_elem_tot(), nb_elem_tot = domaine_VDF.nb_elem_tot(); - const IntTab& face_voisins = domaine_VDF.face_voisins(), &elem_faces = domaine_VDF.elem_faces(); + assert(vitesse.line_size() == 1); + DoubleTrav tab_duidxj(nb_elem_tot, dimension, dimension, vitesse.line_size()); + vit.calcul_duidxj(vitesse, tab_duidxj, domaine_Cl_VDF); + + const int dim = Objet_U::dimension; + CDoubleTabView4 duidxj = tab_duidxj.view_ro<4>(); + CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro(); + CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro(); + DoubleArrView OP1 = OP1_.view_rw(); + DoubleArrView OP2 = OP2_.view_rw(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int 
elem) + { + double gij2[3][3]; + double sd[3][3]; + + // Calcul du terme gij2 + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + { + gij2[i][j] = 0; + for (int k = 0; k < dim; k++) + gij2[i][j] += duidxj(elem, i, k, 0) * duidxj(elem, k, j, 0); + } + + // Calcul du terme gkk2 + double gkk2 = 0; + for (int k = 0; k < dim; k++) + gkk2 += gij2[k][k]; - DoubleTrav gij2(dimension, dimension), sd(dimension, dimension); + // Calcul de sd + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + { + sd[i][j] = 0.5 * (gij2[i][j] + gij2[j][i]); + if (i == j) + sd[i][j] -= gkk2 / 3.; // Terme derriere le tenseur de Kronecker + } - double gkk2, sd2, Sij, Sij2; + // Calcul de sd2 et Sij2 + double sd2 = 0., Sij2 = 0.; - assert(vitesse.line_size() == 1); - DoubleTab duidxj(nb_elem_tot, dimension, dimension, vitesse.line_size()); + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + { + sd2 += sd[i][j] * sd[i][j]; + double Sij = 0.5 * (duidxj(elem, i, j, 0) + duidxj(elem, j, i, 0)); + + if (i == j) // augmentation du stencil de Sii + { + const int face1 = elem_faces(elem, i); + const int face2 = elem_faces(elem, i + dim); + const int elem1 = face_voisins(face1, 0); + const int elem2 = face_voisins(face2, 1); + // si pas de bord a proximite on passe au stencil de 3 mailles + // sinon on reste au stencil a 1 maille + if (elem1 >= 0 && elem2 >= 0) + Sij = (duidxj(elem1, i, i, 0) + duidxj(elem, i, i, 0) + duidxj(elem2, i, i, 0)) / 3.; + } + + Sij2 += Sij * Sij; + } - vit.calcul_duidxj(vitesse, duidxj, domaine_Cl_VDF); + // Calcul de OP1 et OP2 (replace pow by sqrt and multiply, faster) + OP1(elem) = sd2 * Kokkos::sqrt(sd2); + OP2(elem) = Sij2 * Sij2 * Kokkos::sqrt(Sij2) + sd2 * Kokkos::sqrt(Kokkos::sqrt(sd2)); - for (int elem = 0; elem < nb_elem; elem++) - { - //Calcul du terme gij2 - for (int i = 0; i < dimension; i++) - for (int j = 0; j < dimension; j++) - { - gij2(i, j) = 0; - - for (int k = 0; k < dimension; k++) - gij2(i, j) += duidxj(elem, i, 
k, 0) * duidxj(elem, k, j, 0); - } - - // Calcul du terme gkk2 - gkk2 = 0; - for (int k = 0; k < dimension; k++) - gkk2 += gij2(k, k); - - // Calcul de sd - for (int i = 0; i < dimension; i++) - for (int j = 0; j < dimension; j++) - { - sd(i, j) = 0.5 * (gij2(i, j) + gij2(j, i)); - if (i == j) - sd(i, j) -= gkk2 / 3.; // Terme derriere le tenseur de Kronecker - } - - // Calcul de sd2 et Sij2 - sd2 = 0.; - Sij2 = 0.; - - int face1 = 0, face2 = 0; - int elem1, elem2; - - for (int i = 0; i < dimension; i++) - for (int j = 0; j < dimension; j++) - { - sd2 += sd(i, j) * sd(i, j); - //Deplacement du calcul de sij - Sij = 0.5 * (duidxj(elem, i, j, 0) + duidxj(elem, j, i, 0)); - - // PQ : 24/01/07 : le stencil de Sij est par contruction de : - // - 1 maille pour les termes diagonaux Sii - // - ~2 mailles pour les termes croises Sij - // - // Wale s'appuyant a la fois sur sd2 (porte par Sij) et sur Sij2 (porte principalement par Sii) - // est sensible a cette difference de stencil. - // En portant le stencil a 3 maille spour le calcul de Sii, on retrouve en THI - // le bon taux de dissipation ainsi que des spectres possedant la bonne allure en k^-5/3. - // - // A traiter : Quid sur canal plan ??? 
- - if (i == j) // augmentation du stencil de Sii - { - face1 = elem_faces(elem, i); - face2 = elem_faces(elem, i + dimension); - - elem1 = face_voisins(face1, 0); - elem2 = face_voisins(face2, 1); - - //if(elem1==elem) elem1=face_voisins(face1,1); // par construction il n'y a pas besoin - //if(elem2==elem) elem2=face_voisins(face2,0); // par construction il n'y a pas besoin - - // si pas de bord a proximite on passe au stencil de 3 mailles - // sinon on reste au stencil a 1 maille - - if (elem1 >= 0 && elem2 >= 0) - Sij = ((duidxj(elem1, i, i, 0) + duidxj(elem, i, i, 0) + duidxj(elem2, i, i, 0))) / 3.; - } - - Sij2 += Sij * Sij; - } - - // Calcul de OP1 et OP2 - OP1_(elem) = pow(sd2, 1.5); - OP2_(elem) = pow(Sij2, 2.5) + pow(sd2, 1.25); - - } // fin de la boucle sur les elements + }); // fin de la boucle sur les elements + end_gpu_timer(__KERNEL_NAME__); } diff --git a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.h b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.h index 8dd6db18ff..c6363d6573 100644 --- a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.h +++ b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.h @@ -33,12 +33,13 @@ class Modele_turbulence_hyd_LES_Wale_VDF: public Modele_turbulence_hyd_LES_VDF_b Modele_turbulence_hyd_LES_Wale_VDF(); void set_param(Param& param) const override; + protected_but_public_for_cuda + Champ_Fonc_base& calculer_viscosite_turbulente() override; + void calculer_OP1_OP2(); + protected: double cw_ = 0.5; DoubleVect OP1_, OP2_; - - Champ_Fonc_base& calculer_viscosite_turbulente() override; - void calculer_OP1_OP2(); }; #endif /* Modele_turbulence_hyd_LES_Wale_VDF_included */ diff --git a/src/VDF/Turbulence/Modele_turbulence_hyd_Longueur_Melange_VDF.cpp b/src/VDF/Turbulence/Modele_turbulence_hyd_Longueur_Melange_VDF.cpp index 323fc57bf2..c2624fd53f 100644 --- a/src/VDF/Turbulence/Modele_turbulence_hyd_Longueur_Melange_VDF.cpp +++ b/src/VDF/Turbulence/Modele_turbulence_hyd_Longueur_Melange_VDF.cpp @@ 
-86,6 +86,7 @@ Champ_Fonc_base& Modele_turbulence_hyd_Longueur_Melange_VDF::calculer_viscosite_ // CANAL PLAN suivant (Ox - h=2) ********************************** + ToDo_Kokkos("critical"); for (int elem = 0; elem < nb_elem; elem++) { double y = xp(elem, direction_); @@ -120,6 +121,7 @@ void Modele_turbulence_hyd_Longueur_Melange_VDF::calculer_Sij2() ch.calcul_duidxj(vitesse, duidxj, domaine_Cl_VDF); + ToDo_Kokkos("critical"); for (int elem = 0; elem < nb_elem; elem++) { for (i = 0; i < dimension; i++) diff --git a/src/VDF/Turbulence/Paroi_negligeable_VDF.cpp b/src/VDF/Turbulence/Paroi_negligeable_VDF.cpp index 29d0bbef22..6bf922d8f8 100644 --- a/src/VDF/Turbulence/Paroi_negligeable_VDF.cpp +++ b/src/VDF/Turbulence/Paroi_negligeable_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -38,153 +38,61 @@ int Paroi_negligeable_VDF::init_lois_paroi() int Paroi_negligeable_VDF::calculer_hyd(DoubleTab& tab_k_eps) { - const Equation_base& eqn_hydr = mon_modele_turb_hyd->equation(); - if (sub_type(Fluide_base, eqn_hydr.milieu())) - { - int ndeb, nfin, elem, ori, l_unif; - double norm_tau, u_etoile, norm_v = 0, dist, val0, val1, val2, d_visco = 0, visco = 1.; - - const Domaine_VDF& domaine_VDF = ref_cast(Domaine_VDF, le_dom_dis_.valeur()); - const IntTab& face_voisins = domaine_VDF.face_voisins(); - const IntVect& orientation = domaine_VDF.orientation(); - const Fluide_base& le_fluide = ref_cast(Fluide_base, eqn_hydr.milieu()); - const Champ_Don_base& ch_visco_cin = le_fluide.viscosite_cinematique(); - const DoubleTab& tab_visco = ch_visco_cin.valeurs(); - const DoubleTab& vit = eqn_hydr.inconnue().valeurs(); - - if (sub_type(Champ_Uniforme, ch_visco_cin)) - { - visco = tab_visco(0, 0); - l_unif = 1; - 
} - else - l_unif = 0; - - for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++) - { - const Cond_lim& la_cl = le_dom_Cl_dis_->les_conditions_limites(n_bord); - - if (sub_type(Dirichlet_paroi_fixe, la_cl.valeur())) - { - const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); - ndeb = le_bord.num_premiere_face(); - nfin = ndeb + le_bord.nb_faces(); - - for (int num_face = ndeb; num_face < nfin; num_face++) - { - - if (face_voisins(num_face, 0) != -1) - elem = face_voisins(num_face, 0); - else - elem = face_voisins(num_face, 1); - - if (dimension == 2) - { - ori = orientation(num_face); - norm_v = norm_2D_vit(vit, elem, ori, domaine_VDF, val0); - } - else if (dimension == 3) - { - ori = orientation(num_face); - norm_v = norm_3D_vit(vit, elem, ori, domaine_VDF, val1, val2); - } - - if (axi) - dist = domaine_VDF.dist_norm_bord_axi(num_face); - else - dist = domaine_VDF.dist_norm_bord(num_face); - if (l_unif) - d_visco = visco; - else - d_visco = tab_visco[elem]; - - norm_tau = d_visco * norm_v / dist; - u_etoile = sqrt(norm_tau); - tab_u_star_(num_face) = u_etoile; - - } // loop on faces - - } // Fin paroi fixe - - } // Fin boucle sur les bords - - } - return 1; + return calculer_hyd(tab_k_eps, tab_k_eps); // arguments are not used anyway } int Paroi_negligeable_VDF::calculer_hyd(DoubleTab& tab_nu_t, DoubleTab& tab_k) { const Equation_base& eqn_hydr = mon_modele_turb_hyd->equation(); - if (sub_type(Fluide_base, eqn_hydr.milieu())) + if (!sub_type(Fluide_base, eqn_hydr.milieu())) return 1; + + const Domaine_VDF& domaine_VDF = ref_cast(Domaine_VDF, le_dom_dis_.valeur()); + const Domaine_Cl_VDF& dom_Cl_VDF = ref_cast(Domaine_Cl_VDF, le_dom_Cl_dis_.valeur()); + const Champ_Don_base& ch_visco_cin = ref_cast(Fluide_base, eqn_hydr.milieu()).viscosite_cinematique(); + const DoubleTab& tab_visco_cin = ch_visco_cin.valeurs(); + const int l_unif = sub_type(Champ_Uniforme, ch_visco_cin) ? 1 : 0; + const double visco = l_unif ? 
tab_visco_cin(0, 0) : 1.; + + const int dim = Objet_U::dimension; + const int is_axi = Objet_U::axi; + Domaine_VDF_View dom_vdf(domaine_VDF); + CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro(); + CIntArrView orientation = domaine_VDF.orientation().view_ro(); + CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro(); + CDoubleArrView vitesse = static_cast(eqn_hydr.inconnue().valeurs()).view_ro(); + CDoubleTabView tab_visco; + if (!l_unif) tab_visco = tab_visco_cin.view_ro(); + DoubleArrView u_star = tab_u_star_.view_wo(); + for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++) { - int ndeb, nfin, elem, ori, l_unif; - double norm_tau, u_etoile, norm_v = 0, dist, val0, val1, val2, d_visco = 0, visco = 1.; - - const Domaine_VDF& domaine_VDF = ref_cast(Domaine_VDF, le_dom_dis_.valeur()); - const IntTab& face_voisins = domaine_VDF.face_voisins(); - const IntVect& orientation = domaine_VDF.orientation(); - const Fluide_base& le_fluide = ref_cast(Fluide_base, eqn_hydr.milieu()); - const Champ_Don_base& ch_visco_cin = le_fluide.viscosite_cinematique(); - const DoubleTab& tab_visco = ch_visco_cin.valeurs(); - const DoubleTab& vit = eqn_hydr.inconnue().valeurs(); - - if (sub_type(Champ_Uniforme, ch_visco_cin)) + const Cond_lim& la_cl = dom_Cl_VDF.les_conditions_limites(n_bord); + if (sub_type(Dirichlet_paroi_fixe, la_cl.valeur())) { - visco = tab_visco(0, 0); - l_unif = 1; - } - else - l_unif = 0; - - for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++) - { - const Cond_lim& la_cl = le_dom_Cl_dis_->les_conditions_limites(n_bord); - - if (sub_type(Dirichlet_paroi_fixe, la_cl.valeur())) - { - const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); - ndeb = le_bord.num_premiere_face(); - nfin = ndeb + le_bord.nb_faces(); - - for (int num_face = ndeb; num_face < nfin; num_face++) - { - - if (face_voisins(num_face, 0) != -1) - elem = face_voisins(num_face, 0); - else - elem = face_voisins(num_face, 1); - - if (dimension 
== 2) - { - ori = orientation(num_face); - norm_v = norm_2D_vit(vit, elem, ori, domaine_VDF, val0); - } - else if (dimension == 3) - { - ori = orientation(num_face); - norm_v = norm_3D_vit(vit, elem, ori, domaine_VDF, val1, val2); - } - - if (axi) - dist = domaine_VDF.dist_norm_bord_axi(num_face); - else - dist = domaine_VDF.dist_norm_bord(num_face); - if (l_unif) - d_visco = visco; - else - d_visco = tab_visco[elem]; - - norm_tau = d_visco * norm_v / dist; - u_etoile = sqrt(norm_tau); - tab_u_star_(num_face) = u_etoile; - - } // loop on faces - - } // Fin paroi fixe - - } // Fin boucle sur les bords - - } + const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); + const int ndeb = le_bord.num_premiere_face(); + const int nfin = ndeb + le_bord.nb_faces(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face) + { + const int elem = face_voisins(num_face, 0) != -1 ? face_voisins(num_face, 0) : face_voisins(num_face, 1); + const int ori = orientation(num_face); + double norm_v = 0; + if (dim == 2) + { + double val0; + norm_v = norm_2D_vit(vitesse, elem, ori, elem_faces, val0); + } + else + { + double val1, val2; + norm_v = norm_3D_vit(vitesse, elem, ori, elem_faces, val1, val2); + } + const double dist = is_axi ? dom_vdf.dist_norm_bord_axi(num_face) : dom_vdf.dist_norm_bord(num_face); + const double d_visco = l_unif ? 
visco : tab_visco(elem, 0); + u_star(num_face) = Kokkos::sqrt(d_visco * norm_v / dist); + }); // loop on faces + end_gpu_timer(__KERNEL_NAME__); + } // Fin paroi fixe + } // Fin boucle sur les bords return 1; } diff --git a/src/VDF/Turbulence/Paroi_scal_hyd_base_VDF.cpp b/src/VDF/Turbulence/Paroi_scal_hyd_base_VDF.cpp index aaa6fa549b..1558eb9cf8 100644 --- a/src/VDF/Turbulence/Paroi_scal_hyd_base_VDF.cpp +++ b/src/VDF/Turbulence/Paroi_scal_hyd_base_VDF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -102,6 +102,7 @@ void Paroi_scal_hyd_base_VDF::compute_nusselt() const ndeb = le_bord.num_premiere_face(); nfin = ndeb + le_bord.nb_faces(); + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { double dist, lambda; @@ -200,6 +201,7 @@ void Paroi_scal_hyd_base_VDF::imprimer_nusselt(Sortie& os) const } ndeb = le_bord.num_premiere_face(); nfin = ndeb + le_bord.nb_faces(); + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { double x = domaine_VDF.xv(num_face, 0); @@ -251,6 +253,7 @@ void Paroi_scal_hyd_base_VDF::imprimer_nusselt(Sortie& os) const ndeb = le_bord.num_premiere_face(); nfin = ndeb + le_bord.nb_faces(); + ToDo_Kokkos("critical"); for (int num_face = ndeb; num_face < nfin; num_face++) { double x = domaine_VDF.xv(num_face, 0); diff --git a/src/VEF/Champs/Champ_Fonc_Tabule_P0_VEF.cpp b/src/VEF/Champs/Champ_Fonc_Tabule_P0_VEF.cpp index 29143d6603..fc90211b54 100644 --- a/src/VEF/Champs/Champ_Fonc_Tabule_P0_VEF.cpp +++ b/src/VEF/Champs/Champ_Fonc_Tabule_P0_VEF.cpp @@ -33,78 +33,7 @@ void Champ_Fonc_Tabule_P0_VEF::associer_param(const VECT(OBS_PTR(Champ_base)) &l void Champ_Fonc_Tabule_P0_VEF::mettre_a_jour(double t) { - const 
Domaine_VF& domaine_VF = le_dom_VF.valeur(); - const Table& table = la_table.valeur(); - DoubleTab& mes_valeurs = valeurs(); - const int nb_elem = domaine_VF.nb_elem(), nb_elem_tot = domaine_VF.nb_elem_tot(), nb_param = les_ch_param.size(); - const int nbcomp = mes_valeurs.dimension(1); - const DoubleTab& centres_de_gravites = domaine_VF.xp(); - - // ToDo Kokkos: factorize somewhere this array or rewrite valeur_aux_elems ! - IntTrav les_polys(nb_elem_tot); - IntArrView les_polys_v = static_cast(les_polys).view_wo(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem_tot, KOKKOS_LAMBDA(const int num_elem) - { - les_polys_v(num_elem) = num_elem; - }); - end_gpu_timer(__KERNEL_NAME__); - - if (nb_param==1 && nbcomp==1 && table.isfonction()==1) - { - // Ported on GPU. ToDo Kokkos, extend to more than one param or more than one nbcomp - DoubleTrav val_param_aux_elems(nb_elem_tot, nbcomp); - les_ch_param[0]->valeur_aux_elems(centres_de_gravites, les_polys, val_param_aux_elems); - // Cree un parser specifique ParserView pour Kokkos: - ParserView parser(table.parser(0)); - parser.parseString(); - CDoubleTabView val_params_aux_elems_v = val_param_aux_elems.view_ro(); - DoubleTabView mes_valeurs_v = mes_valeurs.view_wo(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA( - const int num_elem) - { - int threadId = parser.acquire(); - for (int ncomp = 0; ncomp < nbcomp; ncomp++) - { - double val = val_params_aux_elems_v(num_elem, ncomp); - - parser.setVar(0, val, threadId); - mes_valeurs_v(num_elem, ncomp) = parser.eval(threadId); - } - parser.release(threadId); - }); - end_gpu_timer(__KERNEL_NAME__); - } - else - { - ToDo_Kokkos("critical"); - DoubleTabs val_params_aux_elems; - for (int i = 0; i < nb_param; i++) - { - DoubleTab vp(nb_elem_tot, mes_valeurs.dimension(1)); - val_params_aux_elems.add(vp); - } - for (int i = 0; i < nb_param; i++) - les_ch_param[i]->valeur_aux_elems(centres_de_gravites, les_polys, 
val_params_aux_elems[i]); - - if (table.isfonction() != 2) - { - std::vector vals; - vals.reserve(nb_param); // Pre-allocate space once - for (int num_elem = 0; num_elem < nb_elem; num_elem++) - for (int ncomp = 0; ncomp < nbcomp; ncomp++) - { - vals.clear(); - for (int n = 0; n < nb_param; n++) - vals.push_back(val_params_aux_elems[n](num_elem, ncomp)); - mes_valeurs(num_elem, ncomp) = table.val(vals, ncomp); - } - } - else - { - table.valeurs(val_params_aux_elems[0], centres_de_gravites, t, mes_valeurs); - } - } - Champ_Fonc_base::mettre_a_jour(t); + Champ_Fonc_P0_base::mettre_a_jour(t, la_table.valeur(), les_ch_param); } int Champ_Fonc_Tabule_P0_VEF::initialiser(const double un_temps) diff --git a/src/VEF/Champs/Champ_P1NC.cpp b/src/VEF/Champs/Champ_P1NC.cpp index b6c6b58345..59caa2c95b 100644 --- a/src/VEF/Champs/Champ_P1NC.cpp +++ b/src/VEF/Champs/Champ_P1NC.cpp @@ -565,7 +565,7 @@ void Champ_P1NC::calcul_y_plus(const Domaine_Cl_VEF& domaine_Cl_VEF, DoubleVect& } // tab_visco+=DMINFLOAT; - DoubleTab yplus_faces(1, 1); // will contain yplus values if available + DoubleTrav yplus_faces(1, 1); // will contain yplus values if available int yplus_already_computed = 0; // flag const RefObjU& modele_turbulence = eqn_hydr.get_modele(TURBULENCE); @@ -986,60 +986,97 @@ DoubleTab& Champ_P1NC::calcul_duidxj_paroi(DoubleTab& tab_gij, const DoubleTab& CDoubleTabView tau_tan = tab_tau_tan.view_ro(); CDoubleArrView nu = static_cast(tab_nu).view_ro(); CDoubleArrView nu_turb = static_cast(tab_nu_turb).view_ro(); - DoubleTabView3 gij = tab_gij.view_rw<3>(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin), KOKKOS_LAMBDA (const int fac) + // Two-pass approach: compute all C values from the unmodified gij (pass 1), + // then apply corrections atomically (pass 2). 
This avoids a race condition + // when multiple faces share the same adjacent element (corner elements), and + // gives the same result as serial execution when num1 is unique per boundary. + DoubleTrav tab_C(nfin - ndeb); + DoubleArrView C = static_cast(tab_C).view_rw(); + + // Pass 1: read gij (read-only), compute and store C per face { - double P[3][3]; - int num1 = face_voisins(fac, 0); - // definition des vecteurs unitaires constituant le repere local - // stockes dans la matrice de passage P - // vecteur tangentiel (porte par la vitesse tangentielle) - double sum = 0.; - for (int i = 0; i < dim; i++) - sum += tau_tan(fac, i) * tau_tan(fac, i); - double norme_tau_tan = sqrt(sum); - for (int i = 0; i < dim; i++) - P[i][0] = tau_tan(fac, i) / (norme_tau_tan + DMINFLOAT); - - // vecteur normal a la paroi - sum = 0.; - for (int i = 0; i < dim; i++) - sum += face_normale(fac, i) * face_normale(fac, i); - double norme = sqrt(sum); - - int signe = -oriente_normale(fac, num1, face_voisins); // orientation vers l'interieur - for (int i = 0; i < dim; i++) - P[i][1] = signe * face_normale(fac, i) / norme; - - // (3D) on complete la base par le deuxieme vecteur tangentiel - if (dim == 3) - { - P[0][2] = P[1][0] * P[2][1] - P[2][0] * P[1][1]; - P[1][2] = P[2][0] * P[0][1] - P[0][0] * P[2][1]; - P[2][2] = P[0][0] * P[1][1] - P[1][0] * P[0][1]; - } - // determination du terme d(u_t)/dn a enlever - // -1 - // terme identifie a l'aide du produit : F = P . G . 
P - // - double dutdn_old = 0.; - for (int i = 0; i < dim; i++) - for (int j = 0; j < dim; j++) + CDoubleTabView3 gij = tab_gij.view_ro<3>(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin), KOKKOS_LAMBDA (const int fac) + { + double P[3][3]; + int num1 = face_voisins(fac, 0); + double sum = 0.; + for (int i = 0; i < dim; i++) + sum += tau_tan(fac, i) * tau_tan(fac, i); + double norme_tau_tan = sqrt(sum); + for (int i = 0; i < dim; i++) + P[i][0] = tau_tan(fac, i) / (norme_tau_tan + DMINFLOAT); + + sum = 0.; + for (int i = 0; i < dim; i++) + sum += face_normale(fac, i) * face_normale(fac, i); + double norme = sqrt(sum); + + int signe = -oriente_normale(fac, num1, face_voisins); // orientation vers l'interieur + for (int i = 0; i < dim; i++) + P[i][1] = signe * face_normale(fac, i) / norme; + + if (dim == 3) { - double gij_value = Kokkos::atomic_fetch_add(&gij(num1, i, j), 0.0); - dutdn_old += gij_value * P[j][1] * P[i][0]; + P[0][2] = P[1][0] * P[2][1] - P[2][0] * P[1][1]; + P[1][2] = P[2][0] * P[0][1] - P[0][0] * P[2][1]; + P[2][2] = P[0][0] * P[1][1] - P[1][0] * P[0][1]; } - // Correction finale apportee a la matrice G - double C = -dutdn_old + norme_tau_tan / (nu[num1] + nu_turb[num1]) * porosite_face(fac); + // determination du terme d(u_t)/dn a enlever + // -1 + // terme identifie a l'aide du produit : F = P . G . 
P + // + double dutdn_old = 0.; + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + dutdn_old += gij(num1, i, j) * P[j][1] * P[i][0]; + + // Correction finale apportee a la matrice G + // la division par (nu[num1]+nu_turb[num1]) s'impose du fait que l'operateur de diffusion + // fait intervenir le produit : (nu[num1]+nu_turb[num1])*g(i,j) + C(fac - ndeb) = -dutdn_old + norme_tau_tan / (nu[num1] + nu_turb[num1]) * porosite_face(fac); + }); + end_gpu_timer(__KERNEL_NAME__); + } - // la division par (nu[num1]+nu_turb[num1]) s'impose du fait que l'operateur de diffusion - // fait intervenir le produit : (nu[num1]+nu_turb[num1])*g(i,j) - for (int i = 0; i < dim; i++) - for (int j = 0; j < dim; j++) - Kokkos::atomic_add(&gij(num1, i, j), C * P[j][1] * P[i][0]); - }); - end_gpu_timer(__KERNEL_NAME__); + // Pass 2: apply corrections to gij atomically + { + DoubleTabView3 gij = tab_gij.view_rw<3>(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin), KOKKOS_LAMBDA (const int fac) + { + double P[3][3]; + int num1 = face_voisins(fac, 0); + double sum = 0.; + for (int i = 0; i < dim; i++) + sum += tau_tan(fac, i) * tau_tan(fac, i); + double norme_tau_tan = sqrt(sum); + for (int i = 0; i < dim; i++) + P[i][0] = tau_tan(fac, i) / (norme_tau_tan + DMINFLOAT); + + sum = 0.; + for (int i = 0; i < dim; i++) + sum += face_normale(fac, i) * face_normale(fac, i); + double norme = sqrt(sum); + + int signe = -oriente_normale(fac, num1, face_voisins); // orientation vers l'interieur + for (int i = 0; i < dim; i++) + P[i][1] = signe * face_normale(fac, i) / norme; + + if (dim == 3) + { + P[0][2] = P[1][0] * P[2][1] - P[2][0] * P[1][1]; + P[1][2] = P[2][0] * P[0][1] - P[0][0] * P[2][1]; + P[2][2] = P[0][0] * P[1][1] - P[1][0] * P[0][1]; + } + + const double coeff = C(fac - ndeb); + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + Kokkos::atomic_add(&gij(num1, i, j), coeff * P[j][1] * P[i][0]); + }); + 
end_gpu_timer(__KERNEL_NAME__); + } } } diff --git a/src/VEF/Champs/Champ_P1NC_implementation.cpp b/src/VEF/Champs/Champ_P1NC_implementation.cpp index 4203a4a587..d23658c66b 100644 --- a/src/VEF/Champs/Champ_P1NC_implementation.cpp +++ b/src/VEF/Champs/Champ_P1NC_implementation.cpp @@ -1794,7 +1794,7 @@ double Champ_P1NC_implementation::valeur_a_sommet_compo(int num_som, int num_ele } KOKKOS_INLINE_FUNCTION -double Champ_P1NC_implementation::valeur_a_sommet_compo(int num_som, int num_elem, int ncomp, CIntTabView elem_faces, CIntTabView sommet_elem, CDoubleTabView ch) const +double Champ_P1NC_implementation::valeur_a_sommet_compo(int num_som, int num_elem, int ncomp, CIntTabView elem_faces, CIntTabView sommet_elem, CDoubleTabView ch) { double val=0; if (num_elem != -1) diff --git a/src/VEF/Champs/Champ_P1NC_implementation.h b/src/VEF/Champs/Champ_P1NC_implementation.h index 9c9b7edfa7..ed630d4e10 100644 --- a/src/VEF/Champs/Champ_P1NC_implementation.h +++ b/src/VEF/Champs/Champ_P1NC_implementation.h @@ -36,12 +36,12 @@ class Champ_P1NC_implementation: public Champ_implementation_divers int fixer_nb_valeurs_nodales(int); - KOKKOS_INLINE_FUNCTION double fonction_forme_2D_v(double x, double y, int le_poly, int face, CIntTabView sommet_poly, CDoubleTabView coord) const + KOKKOS_INLINE_FUNCTION static double fonction_forme_2D_v(double x, double y, int le_poly, int face, CIntTabView sommet_poly, CDoubleTabView coord) { return 1 - 2 * coord_barycentrique_P1_triangle(sommet_poly, coord, x, y, le_poly, face); } - KOKKOS_INLINE_FUNCTION double fonction_forme_3D_v(double x, double y, double z, int le_poly, int face, CIntTabView sommet_poly, CDoubleTabView coord) const + KOKKOS_INLINE_FUNCTION static double fonction_forme_3D_v(double x, double y, double z, int le_poly, int face, CIntTabView sommet_poly, CDoubleTabView coord) { return 1 - 3 * coord_barycentrique_P1_tetraedre(sommet_poly, coord, x, y, z, le_poly, face); } @@ -82,8 +82,8 @@ class Champ_P1NC_implementation: public 
Champ_implementation_divers DoubleVect& valeur_aux_sommets_compo(const Domaine& dom, DoubleVect& ch_som, int ncomp) const override; // Retourne la valeur de la composante ncomp du champ au sommet num_som sur l'element le_poly double valeur_a_sommet_compo(int num_som, int le_poly, int ncomp) const; - KOKKOS_INLINE_FUNCTION - double valeur_a_sommet_compo(int num_som, int num_elem, int ncomp, CIntTabView elem_faces, CIntTabView sommet_elem, CDoubleTabView ch) const; + KOKKOS_INLINE_FUNCTION static + double valeur_a_sommet_compo(int num_som, int num_elem, int ncomp, CIntTabView elem_faces, CIntTabView sommet_elem, CDoubleTabView ch); DoubleTab& valeur_aux_elems_smooth(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs); DoubleVect& valeur_aux_elems_compo_smooth(const DoubleTab& positions, const IntVect& les_polys, DoubleVect& valeurs, int ncomp); diff --git a/src/VEF/Champs/Champ_P1iP1B_implementation.cpp b/src/VEF/Champs/Champ_P1iP1B_implementation.cpp index d0fd8708bd..c2c563d661 100644 --- a/src/VEF/Champs/Champ_P1iP1B_implementation.cpp +++ b/src/VEF/Champs/Champ_P1iP1B_implementation.cpp @@ -276,7 +276,7 @@ void assembler(const Domaine_VEF& domaine_VEF, Matrice& matrice) Matrice_Morse_Sym& MatPoisson=ref_cast(Matrice_Morse_Sym, matrice.valeur()); int nb_som_tot = domaine_VEF.domaine().nb_som_tot(); int nb_arete_tot = domaine_VEF.domaine().nb_aretes_tot(); - int nnz=nb_som_tot+nb_arete_tot; + int nnz=0; const IntTab& aretes_som=domaine_VEF.domaine().aretes_som(); const ArrOfInt& renum_arete_perio=domaine_VEF.get_renum_arete_perio(); const Domaine& dom=domaine_VEF.domaine(); @@ -299,6 +299,9 @@ void assembler(const Domaine_VEF& domaine_VEF, Matrice& matrice) } voisins[som1].add(som2); coeffs[som1].add(1); + nnz++; + if (diag(som1)==0) nnz++; + if (diag(som2)==0) nnz++; diag(som1)++; diag(som2)++; } @@ -309,6 +312,7 @@ void assembler(const Domaine_VEF& domaine_VEF, Matrice& matrice) { assert(i!=dom.get_renum_som_perio(i)); diag(i)=1; // Sommets 
periodiques + nnz++; } } MatPoisson.dimensionner(nb_som_tot, nnz) ; diff --git a/src/VEF/Geometrie/Domaine_Cl_VEF.cpp b/src/VEF/Geometrie/Domaine_Cl_VEF.cpp index dafe173413..c39beaf01b 100644 --- a/src/VEF/Geometrie/Domaine_Cl_VEF.cpp +++ b/src/VEF/Geometrie/Domaine_Cl_VEF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -147,7 +147,6 @@ void Domaine_Cl_VEF::remplir_volumes_entrelaces_Cl(const Domaine_VEF& le_dom_VEF int nb_poly_tot = le_dom_VEF.domaine().nb_elem_tot(); ArrOfInt poly_fait(nb_poly_tot); - poly_fait = 0; for (int i = 0; i < les_conditions_limites_.size(); i++) { const Cond_lim_base& la_cl = les_conditions_limites_[i].valeur(); @@ -425,7 +424,7 @@ void Domaine_Cl_VEF::imposer_cond_lim(Champ_Inc_base& ch, double temps) surf += face_n * face_n; } // flux /= surf; // Fixed bug: Arithmetic exception - if (std::fabs(surf) >= DMINFLOAT) + if (Kokkos::fabs(surf) >= DMINFLOAT) flux /= surf; for (int ncomp = 0; ncomp < nb_comp; ncomp++) tab(num_face, ncomp) -= flux * face_normales(num_face, ncomp); @@ -551,7 +550,7 @@ void Domaine_Cl_VEF::imposer_cond_lim(Champ_Inc_base& ch, double temps) surf += face_n * face_n; } // flux /= surf; // Fixed bug: Arithmetic exception - if (std::fabs(surf) >= DMINFLOAT) + if (Kokkos::fabs(surf) >= DMINFLOAT) flux /= surf; for (int ncomp = 0; ncomp < nb_comp; ncomp++) tab(num_face, ncomp) = val_imp(num_face - ndeb, ncomp) - @@ -621,7 +620,7 @@ void Domaine_Cl_VEF::imposer_cond_lim(Champ_Inc_base& ch, double temps) int num2 = le_bord.nb_faces_tot(); CIntArrView num_face = le_bord.num_face().view_ro(); CIntTabView faces = domaine_vef.face_sommets().view_ro(); - CDoubleArrView flux_impose = static_cast(la_sortie_libre.flux_impose(true)).view_ro(); + 
CDoubleArrView flux_impose = static_cast(la_sortie_libre.tab_flux_impose(true)).view_ro(); CDoubleArrView face_surfaces = domaine_vef.face_surfaces().view_ro(); DoubleArrView surf_loc = static_cast(tab_surf_loc).view_rw(); DoubleArrView pression = static_cast(tab_pression).view_rw(); diff --git a/src/VEF/Geometrie/VerifierCoin.cpp b/src/VEF/Geometrie/VerifierCoin.cpp index 1353f12620..e297f2c132 100644 --- a/src/VEF/Geometrie/VerifierCoin.cpp +++ b/src/VEF/Geometrie/VerifierCoin.cpp @@ -131,7 +131,6 @@ Entree& VerifierCoin::interpreter_(Entree& is) // On compte les elements attaches a chaque sommet: ArrOfInt nb_elem_per_som(nbsom); - nb_elem_per_som = 0; for (int ne = 0; ne < nbelem; ne++) for (int ns = 0; ns < dimension+1; ns++) nb_elem_per_som(les_elems(ne,ns))++; diff --git a/src/VEF/Geometrie/distances_VEF.cpp b/src/VEF/Geometrie/distances_VEF.cpp index 3faf6f311b..846e356413 100644 --- a/src/VEF/Geometrie/distances_VEF.cpp +++ b/src/VEF/Geometrie/distances_VEF.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -13,6 +13,7 @@ * *****************************************************************************/ #include +#include #include #include @@ -493,14 +494,13 @@ double distance_3D(int fac,int elem,const Domaine_VEF& domaine) -DoubleVect& calcul_longueur_filtre(DoubleVect& longueur_filtre, const Motcle& methode, const Domaine_VEF& domaine) +DoubleVect& calcul_longueur_filtre(DoubleVect& tab_longueur_filtre, const Motcle& methode, const Domaine_VEF& domaine) { - int nbr_element=domaine.nb_elem_tot(); - int element; - int dim=Objet_U::dimension; + const int nbr_element = domaine.nb_elem_tot(); + const int dim = Objet_U::dimension; const Domaine& domaine_geom = domaine.domaine(); - if (longueur_filtre.size() != nbr_element) + if (tab_longueur_filtre.size() != nbr_element) { Cerr << "erreur dans la taille du DoubleVect valeurs de la longueur du filtre" << finl; Process::exit(); @@ -508,125 +508,91 @@ DoubleVect& calcul_longueur_filtre(DoubleVect& longueur_filtre, const Motcle& me if (methode == Motcle("volume") || methode == Motcle("volume_sans_lissage")) // racine cubique du volume { - longueur_filtre=-1.; - - const DoubleVect& volume = domaine.volumes(); - for (element=0; element(tab_longueur_filtre_sommet).view_rw(); + CDoubleArrView longueur_filtre = tab_longueur_filtre.view_ro(); + + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nbr_element), KOKKOS_LAMBDA(const int element) + { + for (int som1 = 0; som1 < dim+1; som1++) { - som_0 = les_sommets(element, som1); - som_1 = dom.get_renum_som_perio(som_0); - longueur_filtre_sommet[som_1] = std::max(longueur_filtre(element), longueur_filtre_sommet[som_1]); + int som_0 = les_sommets(element, som1); + int som_1 = renum_som_perio(som_0); + Kokkos::atomic_fetch_max(&longueur_filtre_sommet(som_1), longueur_filtre(element)); } - - longueur_filtre=-1.; - - for 
(element=0; element0.); - - return longueur_filtre; + assert(nbr_element == 0 || min_array(tab_longueur_filtre) > 0.); + return tab_longueur_filtre; } double distance_sommets(const int sommet1, const int sommet2, const Domaine_VEF& domaine) diff --git a/src/VEF/Geometrie/distances_VEF.h b/src/VEF/Geometrie/distances_VEF.h index b4baa80915..1ad81c0d0e 100644 --- a/src/VEF/Geometrie/distances_VEF.h +++ b/src/VEF/Geometrie/distances_VEF.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -117,7 +117,7 @@ double distance_face(int dim, int fac, int fac1, CDoubleTabView xv, CDoubleTabVi a += ni * (xv(fac1,i) - xv(fac,i)); b += ni * ni; } - return std::fabs(a / sqrt(b)); + return Kokkos::fabs(a / sqrt(b)); } // Kokkos function (factorize distance_2D and distance_3D functions) @@ -132,7 +132,7 @@ double distance(int dim,int fac,int elem, CDoubleTabView xp, CDoubleTabView xv, norme += fn_i * fn_i; ps += fn_i * (xp(elem, i) - xv(fac, i)); } - return std::fabs(ps/sqrt(norme)); + return Kokkos::fabs(ps/sqrt(norme)); } // Kokkos function (factorize norm_2D_vit1 and norm_3D_vit1) KOKKOS_INLINE_FUNCTION @@ -168,7 +168,7 @@ double norm_vit1(int dim, CDoubleTabView vit, int fac, int nfac, const int* num, sum_carre += carre(v[i]); psc += v[i] * r[i]; } - double norm_vit = sqrt(std::fabs(sum_carre-carre(psc))); + double norm_vit = sqrt(Kokkos::fabs(sum_carre-carre(psc))); // val1,val2 val3 sont les vitesses tangentielles for (int i=0; i(tab_transporte).view_ro(); CDoubleTabView3 Kij = tab_Kij.view_ro<3>(); - CDoubleTabView val_ext = la_sortie_libre.val_ext().view_ro(); + CDoubleTabView val_ext = la_sortie_libre.tab_val_ext().view_ro(); DoubleArrView resuV = static_cast(tab_resu).view_rw(); 
Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(0, num2), KOKKOS_LAMBDA( @@ -1135,8 +1135,8 @@ Op_Conv_Muscl_New_VEF_Face::ajouter_antidiffusion_v2(const DoubleTab& tab_Kij, c double R; if (kij >= 0.) //facei amont { - if (fij >= 0.) R = (std::fabs(P_plus) < DMINFLOAT) ? 0. : Q_plus / P_plus; - else R = (std::fabs(P_moins) < DMINFLOAT) ? 0. : Q_moins / P_moins; + if (fij >= 0.) R = (Kokkos::fabs(P_plus) < DMINFLOAT) ? 0. : Q_plus / P_plus; + else R = (Kokkos::fabs(P_moins) < DMINFLOAT) ? 0. : Q_moins / P_moins; R = minmod(R); @@ -1153,8 +1153,8 @@ Op_Conv_Muscl_New_VEF_Face::ajouter_antidiffusion_v2(const DoubleTab& tab_Kij, c } else //facej amont { - if (fji <= 0.) R = (std::fabs(P_moins) < DMINFLOAT) ? 0. : Q_moins / P_moins; - else R = (std::fabs(P_plus) < DMINFLOAT) ? 0. : Q_plus / P_plus; + if (fji <= 0.) R = (Kokkos::fabs(P_moins) < DMINFLOAT) ? 0. : Q_moins / P_moins; + else R = (Kokkos::fabs(P_plus) < DMINFLOAT) ? 0. : Q_plus / P_plus; R = minmod(R); R *= fji; @@ -1242,14 +1242,14 @@ Op_Conv_Muscl_New_VEF_Face::ajouter_antidiffusion_v1(const DoubleTab& tab_Kij, c //Face amont : facei if (fij >= 0.) { - Ri = (std::fabs(Pi_plus) < DMINFLOAT) ? 0. : Qi_plus / Pi_plus; - Rj = (std::fabs(Pj_moins) < DMINFLOAT) ? 0. : Qj_moins / + Ri = (Kokkos::fabs(Pi_plus) < DMINFLOAT) ? 0. : Qi_plus / Pi_plus; + Rj = (Kokkos::fabs(Pj_moins) < DMINFLOAT) ? 0. : Qj_moins / Pj_moins;//car fji=-fij } else { - Ri = (std::fabs(Pi_moins) < DMINFLOAT) ? 0. : Qi_moins / Pi_moins; - Rj = (std::fabs(Pj_plus) < DMINFLOAT) ? 0. : Qj_plus / + Ri = (Kokkos::fabs(Pi_moins) < DMINFLOAT) ? 0. : Qi_moins / Pi_moins; + Rj = (Kokkos::fabs(Pj_plus) < DMINFLOAT) ? 0. : Qj_plus / Pj_plus;//car fji=-fij } @@ -1268,13 +1268,13 @@ Op_Conv_Muscl_New_VEF_Face::ajouter_antidiffusion_v1(const DoubleTab& tab_Kij, c //Face amont : facej if (fji <= 0.) { - Rj = (std::fabs(Pj_moins) < DMINFLOAT) ? 0. : Qj_moins / Pj_moins; - Ri = (std::fabs(Pi_plus) < DMINFLOAT) ? 0. 
: Qi_plus / Pi_plus; + Rj = (Kokkos::fabs(Pj_moins) < DMINFLOAT) ? 0. : Qj_moins / Pj_moins; + Ri = (Kokkos::fabs(Pi_plus) < DMINFLOAT) ? 0. : Qi_plus / Pi_plus; } else { - Rj = (std::fabs(Pj_plus) < DMINFLOAT) ? 0. : Qj_plus / Pj_plus; - Ri = (std::fabs(Pi_moins) < DMINFLOAT) ? 0. : Qi_moins / Pi_moins; + Rj = (Kokkos::fabs(Pj_plus) < DMINFLOAT) ? 0. : Qj_plus / Pj_plus; + Ri = (Kokkos::fabs(Pi_moins) < DMINFLOAT) ? 0. : Qi_moins / Pi_moins; } if (is_dirichlet_faces(facei)) @@ -1305,7 +1305,7 @@ Op_Conv_Muscl_New_VEF_Face::calculer_senseur(CDoubleTabView3 Kij, CDoubleTabView const int dim, const int nb_comp, const int face_i, CIntTabView elem_faces, CIntTabView face_voisins, CIntTabView num_fac_loc, double& P_plus, double& P_moins, - double& Q_plus, double& Q_moins) const + double& Q_plus, double& Q_moins) { const int nb_faces_elem=(int)elem_faces.extent(1); for (int elem_voisin=0; elem_voisin<2; elem_voisin++) diff --git a/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.h b/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.h index 12539ddfb3..f267790527 100644 --- a/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.h +++ b/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.h @@ -63,7 +63,7 @@ class Op_Conv_Muscl_New_VEF_Face : public Op_Conv_VEF_Face //test void modifier_pour_Cl(Matrice_Morse&, DoubleTab&) const override; - public_for_cuda + protected_but_public_for_cuda void calculer_flux_bords(const DoubleTab&, const DoubleTab&, const DoubleTab&) const; void calculer_coefficients_operateur_centre(DoubleTab&,DoubleTab&,DoubleTab&,DoubleTab&,const int, const DoubleTab& vitesse) const; void calculer_flux_operateur_centre(DoubleTab&,const DoubleTab&,const DoubleTab&,const DoubleTab&,const DoubleTab&,const int,const DoubleTab&,const DoubleTab&) const; @@ -84,7 +84,7 @@ private : DoubleTab& ajouter_antidiffusion(const DoubleTab&, const DoubleTab&, const DoubleTab&, DoubleTab&) const; - KOKKOS_INLINE_FUNCTION void calculer_senseur(CDoubleTabView3, 
CDoubleTabView4, CDoubleArrView, const int, const int, const int, CIntTabView, CIntTabView, CIntTabView, double&, double&, double&, double&) const; + KOKKOS_INLINE_FUNCTION static void calculer_senseur(CDoubleTabView3, CDoubleTabView4, CDoubleArrView, const int, const int, const int, CIntTabView, CIntTabView, CIntTabView, double&, double&, double&, double&); void calculer_data_pour_dirichlet(); //Attributs de la classe diff --git a/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.cpp b/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.cpp index f0bf6722c3..387c06ef77 100644 --- a/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.cpp +++ b/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.cpp @@ -285,15 +285,15 @@ void compute_flux_tetra_kernel(const FluxTetraKernelData& kernel_data) // Determination du type de CL selon le rang int rang = rang_elem_non_std_v(poly); double xc[3]; - TRUST_IFCONSTEXPR (ordre == 3) // A optimiser! Risque de mauvais resultats en parallel si ordre=3 - { - double xsom[12]; - for (int i = 0; i < nsom_; i++) - for (int j = 0; j < dim; j++) - xsom[i * 3 + j] = coord_sommets_v(les_elems_[i], j); - int idirichlet, n1, n2, n3; - calcul_xg_tetra(xc, xsom, itypcl, idirichlet, n1, n2, n3); - } + if (ordre == 3) // A optimiser! 
Risque de mauvais resultats en parallel si ordre=3 + { + double xsom[12]; + for (int i = 0; i < nsom_; i++) + for (int j = 0; j < dim; j++) + xsom[i * 3 + j] = coord_sommets_v(les_elems_[i], j); + int idirichlet, n1, n2, n3; + calcul_xg_tetra(xc, xsom, itypcl, idirichlet, n1, n2, n3); + } double xp[3] = { xp_v(poly,0), xp_v(poly,1), xp_v(poly,2) }; @@ -1275,7 +1275,7 @@ DoubleTab& Op_Conv_VEF_Face::ajouter_gen(const DoubleTab& transporte, const Cham int num2 = num1 + le_bord.nb_faces(); int dim = Objet_U::dimension; CDoubleTabView face_normale = domaine_VEF.face_normales().view_ro(); - CDoubleTabView val_ext = la_sortie_libre.val_ext().view_ro(); + CDoubleTabView val_ext = la_sortie_libre.tab_val_ext().view_ro(); CDoubleTabView transporte_face_v = transporte_face.view_ro(); CDoubleTabView vitesse_face_v = vitesse_face.view_ro(); DoubleTabView flux_b_v = flux_b.view_wo(); @@ -1780,7 +1780,7 @@ void Op_Conv_VEF_Face::remplir_fluent() const double psc_m = (psc_c + psc_s + psc_s2) / dim; int num = (psc_m >= 0 ? 
num2 : num1); - Kokkos::atomic_add(&fluent[num], std::abs(psc_m)); + Kokkos::atomic_add(&fluent[num], Kokkos::fabs(psc_m)); } // fin de la boucle sur les facettes }; Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem_tot, kernel); diff --git a/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.h b/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.h index 1d1b57e7d2..6ab63f137e 100644 --- a/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.h +++ b/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.h @@ -54,10 +54,11 @@ class Op_Conv_VEF_Face : public Op_Conv_VEF_base void get_alpha(double& ) const; void get_type_op(int& )const; -protected: + protected_but_public_for_cuda DoubleTab& ajouter_gen(const DoubleTab& transporte, const Champ_Inc_base& la_vitesse, DoubleTab& resu) const; void ajouter_contribution_gen(const DoubleTab& transporte, const Champ_Inc_base& la_vitesse, Matrice_Morse& matrice ) const; +protected: Motcle type_lim; enum type_lim_type {type_lim_minmod,type_lim_vanleer,type_lim_vanalbada,type_lim_chakravarthy,type_lim_superbee}; type_lim_type type_lim_int = type_lim_minmod; diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face.cpp b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face.cpp index 555126b88c..482e5d20a0 100644 --- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face.cpp +++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face.cpp @@ -217,7 +217,7 @@ void Op_Diff_VEF_Face::ajouter_cas_scalaire(const DoubleTab& tab_inconnue, { const Neumann_paroi& la_cl_paroi = ref_cast(Neumann_paroi, la_cl.valeur()); CDoubleArrView surface = domaine_VEF.face_surfaces().view_ro(); - CDoubleTabView flux_impose = la_cl_paroi.flux_impose().view_ro(); + CDoubleTabView flux_impose = la_cl_paroi.tab_flux_impose().view_ro(); DoubleArrView flux_bords = static_cast(tab_flux_bords).view_rw(); DoubleArrView resu = static_cast(tab_resu).view_rw(); Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin), KOKKOS_LAMBDA(const int face) diff --git 
a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face_Stab.cpp b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face_Stab.cpp index 0ef4516074..c8fc65c851 100644 --- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face_Stab.cpp +++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face_Stab.cpp @@ -839,11 +839,14 @@ void Op_Diff_VEF_Face_Stab::completer() if ( (sub_type(Dirichlet,la_cl.valeur())) || (sub_type(Dirichlet_homogene,la_cl.valeur())) ) - for (ind_face=0; ind_face double viscA(int face_i, int face_j, int num_elem, const _TYPE_& diffu) const; template - KOKKOS_INLINE_FUNCTION double viscA(int face_i, int face_j, int num_elem, const _TYPE_& diffu, CIntTabView face_voisins_v, CDoubleTabView face_normales_v, CDoubleArrView inverse_volumes_v) const; + KOKKOS_INLINE_FUNCTION static double viscA(int face_i, int face_j, int num_elem, const _TYPE_& diffu, CIntTabView face_voisins_v, CDoubleTabView face_normales_v, CDoubleArrView inverse_volumes_v); double calculer_dt_stab() const override; void calculer_pour_post(Champ_base& espace_stockage,const Nom& option,int comp) const override; @@ -75,14 +75,14 @@ class Op_Diff_VEF_base : public Operateur_Diff_base, public Op_VEF_Face template std::enable_if_t< std::is_same<_TYPE_, TRUSTArray>::value , double> inline diffu__(const int comp, const int num_elem, const _TYPE_ &diffu) const { return diffu(comp); } - template std::enable_if_t< std::is_same<_TYPE_, double>::value , double> - KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) const { return diffu; } + template static std::enable_if_t< std::is_same<_TYPE_, double>::value , double> + KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) { return diffu; } - template std::enable_if_t< std::is_same<_TYPE_, TRUSTTab>::value , double> - KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) const { return diffu(num_elem, comp); } + template static std::enable_if_t< 
std::is_same<_TYPE_, TRUSTTab>::value , double> + KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) { return diffu(num_elem, comp); } - template std::enable_if_t< std::is_same<_TYPE_, TRUSTArray>::value , double> - KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) const { return diffu(comp); } + template static std::enable_if_t< std::is_same<_TYPE_, TRUSTArray>::value , double> + KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) { return diffu(comp); } }; // ATTENTION le diffu intervenant dans les fonctions n'est que LOCAL (on appelle d_nu apres) @@ -117,7 +117,7 @@ inline double Op_Diff_VEF_base::viscA(int i, int j, int num_elem, const _TYPE_ & } template -KOKKOS_INLINE_FUNCTION double Op_Diff_VEF_base::viscA(int i, int j, int num_elem, const _TYPE_ &diffu, CIntTabView face_voisins_v, CDoubleTabView face_normales_v, CDoubleArrView inverse_volumes_v) const +KOKKOS_INLINE_FUNCTION double Op_Diff_VEF_base::viscA(int i, int j, int num_elem, const _TYPE_ &diffu, CIntTabView face_voisins_v, CDoubleTabView face_normales_v, CDoubleArrView inverse_volumes_v) { constexpr bool is_double = std::is_same<_TYPE_, double>::value; int dim = (int)face_normales_v.extent(1); diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_Stab_VEF_Face.cpp b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_Stab_VEF_Face.cpp index 8457451bea..339c2dc9d0 100644 --- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_Stab_VEF_Face.cpp +++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_Stab_VEF_Face.cpp @@ -622,11 +622,14 @@ void Op_Dift_Stab_VEF_Face::completer() int nb_faces_bord_tot = le_bord.nb_faces_tot(), face = -1; if ((sub_type(Dirichlet, la_cl.valeur())) || (sub_type(Dirichlet_homogene, la_cl.valeur()))) - for (ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) - { - face = le_bord.num_face(ind_face); - is_dirichlet_faces_(face) = 1; - } + { + ToDo_Kokkos("critical"); + for (ind_face = 
0; ind_face < nb_faces_bord_tot; ind_face++) + { + face = le_bord.num_face(ind_face); + is_dirichlet_faces_(face) = 1; + } + } } } diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.h b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.h index de9e38ba21..ce65724f3f 100644 --- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.h +++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.h @@ -66,7 +66,7 @@ class Op_Dift_VEF_Face_Gen private: - public_for_cuda + protected_but_public_for_cuda template void ajouter_bord_perio_gen__(const int , const DoubleTab&, DoubleTab* /* Si explicite */ , Matrice_Morse* /* Si implicite */, const DoubleTab&, const DoubleTab&, const DoubleVect& , DoubleTab* flux_bord = nullptr /* flux_bords */) const; diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.tpp b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.tpp index 90c58c0224..2c0ad4cdcc 100644 --- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.tpp +++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.tpp @@ -424,7 +424,7 @@ void Op_Dift_VEF_Face_Gen::modifie_pour_cl_gen(const DoubleTab& tab_i if (sub_type(Neumann_paroi, la_cl.valeur())) { const Neumann_paroi& la_cl_paroi = ref_cast(Neumann_paroi, la_cl.valeur()); - CDoubleTabView flux_impose = la_cl_paroi.flux_impose().view_ro(); + CDoubleTabView flux_impose = la_cl_paroi.tab_flux_impose().view_ro(); CDoubleArrView face_surfaces = domaine_VEF.face_surfaces().view_ro(); DoubleTabView flux_bords = tab_flux_bords.view_wo(); DoubleTabView resu = tab_resu.view_rw(); diff --git a/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.cpp b/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.cpp index d1eddd8cc8..d33be3a4ee 100644 --- a/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.cpp +++ b/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All 
rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -112,18 +112,15 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_elem(const DoubleTab& vit, DoubleTab& div RandomAccessView face_voisins_v = face_voisins.view_ro(); RandomAccessView face_normales_v = face_normales.view_ro(); RandomAccessView vit_v = vit.view_ro(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA( - const int elem) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0,0}, {nb_elem,nfe}), KOKKOS_LAMBDA( + const int elem, const int indice) { double pscf = 0; - for (int indice = 0; indice < nfe; indice++) - { - int face = elem_faces_v(elem, indice); - int signe = elem == face_voisins_v(face, 0) ? 1 : -1; - for (int comp = 0; comp < dim; comp++) - pscf += signe * vit_v(face, comp) * face_normales_v(face, comp); - } - div_v(elem, 0) += pscf; + int face = elem_faces_v(elem, indice); + int signe = elem == face_voisins_v(face, 0) ? 1 : -1; + for (int comp = 0; comp < dim; comp++) + pscf += signe * vit_v(face, comp) * face_normales_v(face, comp); + Kokkos::atomic_add(&div_v(elem, 0), pscf); }); } else @@ -131,18 +128,15 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_elem(const DoubleTab& vit, DoubleTab& div CIntTabView face_voisins_v = face_voisins.view_ro(); CDoubleTabView face_normales_v = face_normales.view_ro(); CDoubleTabView vit_v = vit.view_ro(); - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA( - const int elem) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0,0}, {nb_elem,nfe}), KOKKOS_LAMBDA( + const int elem, const int indice) { double pscf = 0; - for (int indice = 0; indice < nfe; indice++) - { - int face = elem_faces_v(elem, indice); - int signe = elem == face_voisins_v(face, 0) ? 
1 : -1; - for (int comp = 0; comp < dim; comp++) - pscf += signe * vit_v(face, comp) * face_normales_v(face, comp); - } - div_v(elem, 0) += pscf; + int face = elem_faces_v(elem, indice); + int signe = elem == face_voisins_v(face, 0) ? 1 : -1; + for (int comp = 0; comp < dim; comp++) + pscf += signe * vit_v(face, comp) * face_normales_v(face, comp); + Kokkos::atomic_add(&div_v(elem, 0), pscf); }); } end_gpu_timer(__KERNEL_NAME__); @@ -321,7 +315,6 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab& // Initialisation tableaux constants if (!som_initialized_) { - som_initialized_ = true; const IntTab& som_elem = domaine.les_elems(); som_.resize(nb_elem_tot, nfe); nb_degres_liberte_.resize(domaine_VEF.domaine().nb_som_tot()); @@ -340,14 +333,20 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab& int modif_traitement_diri = domaine_VEF.get_modif_div_face_dirichlet(); const Domaine_Cl_VEF& zcl = ref_cast(Domaine_Cl_VEF, la_zcl_vef.valeur()); + CIntArrView rang_elem_non_std; + CIntArrView type_elem_Cl; + if (modif_traitement_diri) + { + rang_elem_non_std = domaine_VEF.rang_elem_non_std().view_ro(); + type_elem_Cl = zcl.type_elem_Cl().view_ro(); + } CDoubleTabView face_normales = domaine_VEF.face_normales().view_ro(); CDoubleTabView vit = tab_vit.view_ro(); - CIntArrView rang_elem_non_std = domaine_VEF.rang_elem_non_std().view_ro(); - CIntArrView type_elem_Cl = zcl.type_elem_Cl().view_ro(); CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro(); CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro(); CIntTabView som_v = som_.view_ro(); DoubleArrView div = static_cast(tab_div).view_rw(); + // PL: not possible to use MDRangePolicy here. 
It needs sigma to be the complete sum over faces Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem_tot), KOKKOS_LAMBDA (const int elem) @@ -418,9 +417,9 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab& if (sub_type(Dirichlet,la_cl.valeur()) || sub_type(Dirichlet_homogene, la_cl.valeur()) || sub_type(Dirichlet_entree_fluide, la_cl.valeur()) || sub_type(Symetrie, la_cl.valeur())) libre = 0; + int som_initialized = som_initialized_; CIntTabView face_sommets = domaine_VEF.face_sommets().view_ro(); CIntArrView renum_som_perio = domaine.get_renum_som_perio().view_ro(); - // On boucle sur les faces de bord reelles et virtuelles Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces_bord_tot), KOKKOS_LAMBDA( @@ -437,7 +436,7 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab& { int som = renum_som_perio(face_sommets(face, indice)); Kokkos::atomic_add(&div(nps + som), flux); - if (libre) + if (libre && !som_initialized) Kokkos::atomic_add(&nb_degres_liberte(som), 1); } }); @@ -470,6 +469,7 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab& end_gpu_timer(__KERNEL_NAME__); } } + som_initialized_ = true; return tab_div; } @@ -755,99 +755,111 @@ void Op_Div_VEFP1B_Elem::degres_liberte() const decoup_som << "1" << finl; decoup_som << Objet_U::dimension << " " << nb_som << finl; ArrOfInt somm(dimension + 2); - for (int k = 0; k < nb_som; k++) + CIntArrView renum_som_perio = domaine.get_renum_som_perio().view_ro(); + CIntArrView nb_degres_liberte = nb_degres_liberte_.view_ro(); + int error = 0; + Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_som, KOKKOS_LAMBDA(const int k, int& local_error) + { + int sommet = renum_som_perio(k); + if (nb_degres_liberte(sommet) == 0) local_error++; + }, error); + end_gpu_timer(__KERNEL_NAME__); + if (error) { - int sommet = domaine.get_renum_som_perio(k); - if (nb_degres_liberte_(sommet) != 0) 
- continue; - if (!afficher_message) + for (int k = 0; k < nb_som; k++) { - afficher_message = 1; - Cerr << finl << "Problem with the mesh used for the VEF P1Bulle discretization." << finl; - journal << "List of nodes with no degrees of freedom :" << finl; - } - const double x = domaine.coord(sommet, 0); - const double y = domaine.coord(sommet, 1); - const double z = (Objet_U::dimension == 3) ? domaine.coord(sommet, 2) : 0.; - - journal << "Error node " << sommet << " ( " << x << " " << y << " " << z << " )\n"; - // On affiche la liste des indices d'elements reels et virtuels qui contiennent - // ce sommet. On affiche la lettre "v" pour les elements virtuels. - journal << "Elements "; - const int nb_elem_tot = domaine.nb_elem_tot(); - const int nb_elem = domaine.nb_elem(); - const IntTab& som_elem = domaine.les_elems(); - for (int elem = 0; elem < nb_elem_tot; elem++) - for (int som = 0; som < nse; som++) - if (som_elem(elem, som) == sommet) + int sommet = domaine.get_renum_som_perio(k); + if (nb_degres_liberte_(sommet) != 0) + continue; + if (!afficher_message) { - journal << elem << ((elem >= nb_elem) ? "v " : " "); + afficher_message = 1; + Cerr << finl << "Problem with the mesh used for the VEF P1Bulle discretization." << finl; + journal << "List of nodes with no degrees of freedom :" << finl; + } + const double x = domaine.coord(sommet, 0); + const double y = domaine.coord(sommet, 1); + const double z = (Objet_U::dimension == 3) ? domaine.coord(sommet, 2) : 0.; + + journal << "Error node " << sommet << " ( " << x << " " << y << " " << z << " )\n"; + // On affiche la liste des indices d'elements reels et virtuels qui contiennent + // ce sommet. On affiche la lettre "v" pour les elements virtuels. 
+ journal << "Elements "; + const int nb_elem_tot = domaine.nb_elem_tot(); + const int nb_elem = domaine.nb_elem(); + const IntTab& som_elem = domaine.les_elems(); + for (int elem = 0; elem < nb_elem_tot; elem++) + for (int som = 0; som < nse; som++) + if (som_elem(elem, som) == sommet) + { + journal << elem << ((elem >= nb_elem) ? "v " : " "); - // Ecriture dans le fichier decoupage_som - int face_opp = elem_faces(elem, som); - int elem_opp; - somm = -1; - somm(0) = sommet; + // Ecriture dans le fichier decoupage_som + int face_opp = elem_faces(elem, som); + int elem_opp; + somm = -1; + somm(0) = sommet; - int elem1 = face_voisins(face_opp, 0); - int elem2 = face_voisins(face_opp, 1); + int elem1 = face_voisins(face_opp, 0); + int elem2 = face_voisins(face_opp, 1); - if (elem1 == elem) - elem_opp = elem2; - else - elem_opp = elem1; - - int i = 2; - for (int som1 = 0; som1 < nse; som1++) // on parcourt les sommets de elem_opp - { - int ok = 1; - for (int som2 = 0; som2 < nse; som2++) // on parcourt les sommets de elem - if (som_elem(elem, som2) == som_elem(elem_opp, som1)) - ok = 0; - if (ok) - somm(1) = som_elem(elem_opp, som1); + if (elem1 == elem) + elem_opp = elem2; else + elem_opp = elem1; + + int i = 2; + for (int som1 = 0; som1 < nse; som1++) // on parcourt les sommets de elem_opp { - somm(i) = som_elem(elem_opp, som1); // sommets de la face commune - i++; + int ok = 1; + for (int som2 = 0; som2 < nse; som2++) // on parcourt les sommets de elem + if (som_elem(elem, som2) == som_elem(elem_opp, som1)) + ok = 0; + if (ok) + somm(1) = som_elem(elem_opp, som1); + else + { + somm(i) = som_elem(elem_opp, som1); // sommets de la face commune + i++; + } + } + if (decoupage_som) + { + ecrire_decoupage_som = 1; + for (int j = 0; j < dimension + 2; j++) + decoup_som << somm(j) << " "; + decoup_som << elem << " " << elem_opp << finl; } } - if (decoupage_som) - { - ecrire_decoupage_som = 1; - for (int j = 0; j < dimension + 2; j++) - decoup_som << somm(j) << " "; - 
decoup_som << elem << " " << elem_opp << finl; - } - } - journal << "\n"; - // On affiche la liste des faces qui contiennent ce sommet. - // Pour les faces de bord, on affiche la condlim, - // pour les faces virtuelles, la lettre "v" - journal << "\nFaces "; - const int nb_faces = domaine_VEF.nb_faces(); - const int nb_som_face = domaine_VEF.face_sommets().dimension(1); - for (int face = 0; face < nb_faces_tot; face++) - { - for (int som = 0; som < nb_som_face; som++) + journal << "\n"; + // On affiche la liste des faces qui contiennent ce sommet. + // Pour les faces de bord, on affiche la condlim, + // pour les faces virtuelles, la lettre "v" + journal << "\nFaces "; + const int nb_faces = domaine_VEF.nb_faces(); + const int nb_som_face = domaine_VEF.face_sommets().dimension(1); + for (int face = 0; face < nb_faces_tot; face++) { - if (domaine_VEF.face_sommets(face, som) == sommet) + for (int som = 0; som < nb_som_face; som++) { - journal << face; - if (face >= nb_faces) // Face virtuelle - journal << "v"; - const int cl = find_cl_face(domaine, face); - // Face de bord reelle: - if (cl >= 0) + if (domaine_VEF.face_sommets(face, som) == sommet) { - const Nom& nom_bord = domaine.frontiere(cl).le_nom(); - journal << "(boundary=" << nom_bord << ")"; + journal << face; + if (face >= nb_faces) // Face virtuelle + journal << "v"; + const int cl = find_cl_face(domaine, face); + // Face de bord reelle: + if (cl >= 0) + { + const Nom& nom_bord = domaine.frontiere(cl).le_nom(); + journal << "(boundary=" << nom_bord << ")"; + } + journal << " "; } - journal << " "; } } + journal << finl; } - journal << finl; } if (ecrire_decoupage_som) diff --git a/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.h b/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.h index 2c80e73d2d..3b3c77d9ed 100644 --- a/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.h +++ b/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.h @@ -56,7 +56,7 @@ class Op_Div_VEFP1B_Elem: public Operateur_Div_base void 
contribuer_a_avec(const DoubleTab&, Matrice_Morse&) const override { } void contribuer_au_second_membre(DoubleTab&) const override { } - public_for_cuda + protected_but_public_for_cuda void volumique_P0(DoubleTab&) const; private: diff --git a/src/VEF/Operateurs/Op_Divers/Op_Grad_VEF_P1B_Face.cpp b/src/VEF/Operateurs/Op_Divers/Op_Grad_VEF_P1B_Face.cpp index e7c327a6a8..d745f8bcc4 100644 --- a/src/VEF/Operateurs/Op_Divers/Op_Grad_VEF_P1B_Face.cpp +++ b/src/VEF/Operateurs/Op_Divers/Op_Grad_VEF_P1B_Face.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -215,7 +215,7 @@ DoubleTab& Op_Grad_VEF_P1B_Face::modifier_grad_pour_Cl(DoubleTab& tab_grad) cons norm += face_normales(face, comp) * face_normales(face, comp); } // psc/=norm; // Fixed bug: Arithmetic exception - if (std::fabs(norm) >= DMINFLOAT) + if (Kokkos::fabs(norm) >= DMINFLOAT) psc /= norm; for (int comp = 0; comp < dim; comp++) grad(face, comp) -= psc * face_normales(face, comp); @@ -242,7 +242,6 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_elem(const DoubleTab& tab_pre, DoubleTa const Domaine_VEF& domaine_VEF = domaine_vef(); assert(domaine_VEF.get_alphaE()); const Domaine& domaine = domaine_VEF.domaine(); - const IntTab& elem_faces = domaine_VEF.elem_faces(); int nfe = domaine.nb_faces_elem(); int nb_elem_tot = domaine.nb_elem_tot(); CDoubleArrView porosite_face = equation().milieu().porosite_face().view_ro(); @@ -265,7 +264,7 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_elem(const DoubleTab& tab_pre, DoubleTa const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); int num1 = le_bord.num_premiere_face(); int num2 = num1 + le_bord.nb_faces(); - CDoubleTabView flux_impose = la_sortie_libre.flux_impose().view_ro(); + 
CDoubleTabView flux_impose = la_sortie_libre.tab_flux_impose().view_ro(); DoubleTabView grad = tab_grad.view_rw(); Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(num1, num2), KOKKOS_LAMBDA(const int face) { @@ -302,30 +301,22 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_elem(const DoubleTab& tab_pre, DoubleTa } CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro(); - CIntTabView elem_faces_v = elem_faces.view_ro(); + CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro(); CDoubleTabView pre = tab_pre.view_ro(); DoubleTabView grad = tab_grad.view_rw(); int dim = Objet_U::dimension; - - auto kern_elem = KOKKOS_LAMBDA(int - elem) + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::MDRangePolicy>({0,0}, {nb_elem_tot,nfe}), KOKKOS_LAMBDA (int elem, int indice) { - for (int indice = 0; indice < nfe; indice++) + int face = elem_faces(elem, indice); + double pe = pre(elem, 0); + double signe = elem == face_voisins(face, 0) ? 1 : -1; + double poro = porosite_face(face); + double coeff = pe * signe * poro; + for (int comp = 0; comp < dim; comp++) { - double pe = pre(elem, 0); - int face = elem_faces_v(elem, indice); - double signe = 1; - if (elem != face_voisins(face, 0)) signe = -1; - for (int comp = 0; comp < dim; comp++) - { - double val = pe * signe * face_normales(face, comp) * porosite_face(face); - Kokkos::atomic_sub(&grad(face, comp), val); - - } + Kokkos::atomic_sub(&grad(face, comp), coeff * face_normales(face, comp)); } - }; - - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem_tot, kern_elem); + }); end_gpu_timer(__KERNEL_NAME__); return tab_grad; @@ -377,21 +368,16 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_som(const DoubleTab& tab_pre, DoubleTab KOKKOS_LAMBDA (int elem, int indice) { int face = elem_faces(elem,indice); - - double signe = 1; - if (elem != face_voisins(face,0)) - signe = -1; - - double sigma[3]; - for (int comp = 0; comp < dim; comp++) - sigma[comp] = face_normales(face,comp) 
* signe; - + double signe = elem == face_voisins(face,0) ? 1 : -1; + double pe = pre(som_v(elem,indice)); + double coeff = coeff_som(elem) * pe * signe; for (int indice2 = 0; indice2 < nfe; indice2++) { - int face2 = elem_faces(elem,indice2); + int face2 = elem_faces(elem, indice2); + double poro = porosite_face(face2); for (int comp = 0; comp < dim; comp++) { - Kokkos::atomic_add(&grad(face2,comp), -(coeff_som(elem) * pre(som_v(elem,indice)) * sigma[comp] * porosite_face(face2))); + Kokkos::atomic_sub(&grad(face2,comp), coeff * poro * face_normales(face,comp)); } } }); @@ -417,7 +403,7 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_som(const DoubleTab& tab_pre, DoubleTab const Neumann_sortie_libre& sortie_libre = ref_cast(Neumann_sortie_libre, la_cl.valeur()); int num1 = le_bord.num_premiere_face(); int num2 = num1 + le_bord.nb_faces(); - CDoubleTabView flux_impose = sortie_libre.flux_impose().view_ro(); + CDoubleTabView flux_impose = sortie_libre.tab_flux_impose().view_ro(); CIntTabView face_sommets = domaine_VEF.face_sommets().view_ro(); CIntArrView renum_som_perio = dom.get_renum_som_perio().view_ro(); Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(num1, num2), KOKKOS_LAMBDA(const int face) @@ -665,13 +651,8 @@ void Op_Grad_VEF_P1B_Face::calculer_flux_bords() const flux_bords_.resize(domaine_VEF.nb_faces_bord(), dimension); flux_bords_ = 0.; - //int nse=domaine_VEF.domaine().nb_som_elem(); int nb_faces_bord = domaine_VEF.premiere_face_int(); int nps = domaine_VEF.numero_premier_sommet(); - const IntTab& sommets = domaine_VEF.face_sommets(); - const IntTab& face_voisins = domaine_VEF.face_voisins(); - //const IntTab& som_elem=le_dom_vef->domaine().les_elems(); - const DoubleTab& face_normales = domaine_VEF.face_normales(); const Navier_Stokes_std& eqn_hydr = ref_cast(Navier_Stokes_std, equation()); const Champ_P1_isoP1Bulle& la_pression_P1B = ref_cast(Champ_P1_isoP1Bulle, eqn_hydr.pression_pa()); // Si on filtre: @@ -682,42 +663,34 @@ 
void Op_Grad_VEF_P1B_Face::calculer_flux_bords() const else la_pression_P1B.filtrage(domaine_VEF, la_pression_P1B); - - - const DoubleVect& pression_P1B = la_pression_P1B.champ_filtre(); - double coeff_P1 = 1. / dimension; bool alphaE = domaine_VEF.get_alphaE(); bool alphaS = domaine_VEF.get_alphaS(); - int nb_som_par_face = sommets.dimension(1); - CIntTabView face_voisins_v = face_voisins.view_ro(); - CIntTabView sommets_v = sommets.view_ro(); - CDoubleTabView face_normales_v = face_normales.view_ro(); - CDoubleArrView pression_P1B_v = pression_P1B.view_ro(); - DoubleTabView flux_bords_v = flux_bords_.view_wo(); + int nb_som_par_face = domaine_VEF.face_sommets().dimension(1); int dim = Objet_U::dimension; - - auto kern_flux_bords = KOKKOS_LAMBDA(int - face) + CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro(); + CIntTabView sommets = domaine_VEF.face_sommets().view_ro(); + CDoubleTabView face_normales = domaine_VEF.face_normales().view_ro(); + CDoubleArrView pression_P1B = static_cast(la_pression_P1B.champ_filtre()).view_ro(); + DoubleTabView flux_bords = flux_bords_.view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_bord, KOKKOS_LAMBDA(int face) { - int elem = face_voisins_v(face, 0); + int elem = face_voisins(face, 0); double pres_tot = 0.; // Contribution de la pression P0 - if (alphaE) pres_tot = pression_P1B_v(elem); + if (alphaE) pres_tot = pression_P1B(elem); // Contribution de la pression P1 if (alphaS) { double pres_som = 0.; for (int som = 0; som < nb_som_par_face; som++) - pres_som += pression_P1B_v(nps + sommets_v(face, som)); + pres_som += pression_P1B(nps + sommets(face, som)); pres_tot += coeff_P1 * pres_som; } // Calcul de la resultante et du couple de pression for (int i = 0; i < dim; i++) - flux_bords_v(face, i) = pres_tot * face_normales_v(face, i); - }; - - Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_bord, kern_flux_bords); + flux_bords(face, i) = pres_tot * face_normales(face, i); + 
}); end_gpu_timer(__KERNEL_NAME__); } diff --git a/src/VEF/Solveurs/Assembleur_P_VEF.h b/src/VEF/Solveurs/Assembleur_P_VEF.h index 765028aab0..1281984561 100644 --- a/src/VEF/Solveurs/Assembleur_P_VEF.h +++ b/src/VEF/Solveurs/Assembleur_P_VEF.h @@ -49,13 +49,15 @@ class Assembleur_P_VEF: public Assembleur_base void completer(const Equation_base&) override; inline const Equation_base& equation() const; + protected_but_public_for_cuda + void calculer_inv_volume(DoubleTab& inv_volumes_entrelaces, const Domaine_Cl_VEF& domaine_Cl_VEF, const DoubleVect& volumes_entrelaces); + protected: OBS_PTR(Equation_base) mon_equation; OBS_PTR(Domaine_VEF) le_dom_VEF; OBS_PTR(Domaine_Cl_VEF) le_dom_Cl_VEF; DoubleTab les_coeff_pression; int has_P_ref = 0; - void calculer_inv_volume(DoubleTab& inv_volumes_entrelaces, const Domaine_Cl_VEF& domaine_Cl_VEF, const DoubleVect& volumes_entrelaces); }; diff --git a/src/VEF/Solveurs/Assembleur_P_VEFPre1B_tools.cpp b/src/VEF/Solveurs/Assembleur_P_VEFPre1B_tools.cpp index 2ee16ee829..c0abf8f3d2 100644 --- a/src/VEF/Solveurs/Assembleur_P_VEFPre1B_tools.cpp +++ b/src/VEF/Solveurs/Assembleur_P_VEFPre1B_tools.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (c) 2025, CEA +* Copyright (c) 2026, CEA * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -29,77 +29,86 @@ #include #include #include +#include +#include +#include +#include -static int face_associee=-1; +static double gradi[3]; +static double gradj[3]; -static ArrOfDouble gradi(3); -static ArrOfDouble gradj(3); -static inline void -projette(ArrOfDouble& grad, int face, const DoubleTab& normales) +template +KOKKOS_INLINE_FUNCTION +static void projette(VectType& grad, int face, const TabType& normales) { + int dimension; + if constexpr (Kokkos::is_view::value) + dimension = (int)normales.extent(1); + else + dimension = normales.dimension(1); double psc=0, norm=0; - int dimension=Objet_U::dimension, comp; - for(comp=0; comp=DMINFLOAT) psc/=norm; - for(comp=0; comp=DMINFLOAT) psc/=norm; + for(int comp=0; comp=0 face_associee_perio pour Periodique +// INTERNAL Interne +// DIRICHLET Dirichlet +// NEUMANN Neumann +// SYMMETRY Symetrie +// OTHER Periodique skipee +enum BOUNDARY { INTERNAL = -1, DIRICHLET = -2, NEUMANN = -3, SYMMETRY = -4, OTHER = -5 }; + +static void build_cl(ArrOfInt& cl, const Conds_lim& les_cl) { - face_associee=-1; - int ok=1; - const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - do + cl = INTERNAL; + for (int i = 0; i < les_cl.size(); i++) { - face=le_bord.num_face(ind_face); - if ((sub_type(Dirichlet, la_cl.valeur())) - || (sub_type(Dirichlet_homogene, la_cl.valeur()))) - { - ok=0; - } - else if (sub_type(Periodique,la_cl.valeur())) + const Cond_lim& la_cl = les_cl[i]; + const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); + int nb_faces_bord_tot = le_bord.nb_faces_tot(); + if (sub_type(Dirichlet, la_cl.valeur()) || sub_type(Dirichlet_homogene, la_cl.valeur())) { - //periodicite - const Periodique& la_cl_perio = ref_cast(Periodique, la_cl.valeur()); - 
face_associee=le_bord.num_face(la_cl_perio.face_associee(ind_face)); - ok=2; + for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) + cl(le_bord.num_face(ind_face)) = DIRICHLET; } else if (sub_type(Neumann_sortie_libre, la_cl.valeur())) { - //sortie_libre - ok=3; + for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) + cl(le_bord.num_face(ind_face)) = NEUMANN; } else if (sub_type(Symetrie, la_cl.valeur())) { - //symetrie - ok=4; + for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) + cl(le_bord.num_face(ind_face)) = SYMMETRY; + } + else if (sub_type(Periodique, la_cl.valeur())) + { + const Periodique& la_cl_perio = ref_cast(Periodique, la_cl.valeur()); + for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) + { + int face = le_bord.num_face(ind_face); + int face_associee = le_bord.num_face(la_cl_perio.face_associee(ind_face)); + cl(face) = face_associee >= face ? face_associee : OTHER; // OTHER = Periodique skipee + } } + else + Process::exit("Not coded"); } - while ( ( (ok==0) || ((ok==2)&&(face_associee0 && sommets[sz-1]==-1) sz--; + for(int i=0; isommets[j]) { - int tmp=sommets[i]; - sommets[i]=sommets[j]; - sommets[j]=tmp; - tmp=faces_op1[i]; - faces_op1[i]=faces_op1[j]; - faces_op1[j]=tmp; - tmp=faces_op2[i]; - faces_op2[i]=faces_op2[j]; - faces_op2[j]=tmp; + swap(sommets[i], sommets[j]); + swap(faces_op1[i], faces_op1[j]); + swap(faces_op2[i], faces_op2[j]); } } static inline int chercher_arete(const Domaine_VEF& domaine_VEF, @@ -324,12 +333,6 @@ static inline int chercher_arete(const Domaine_VEF& domaine_VEF, } return -1; } -static inline void swap (int& i, int& j) -{ - int k=i; - i=j; - j=k; -} // // rempli sommets, faces_op1 et faces_op2 @@ -338,93 +341,88 @@ static inline void swap (int& i, int& j) // dans elem1. face_op2(i) est ... dans elem2. 
(si elem2=-1, alors face_op2=-1) // les dimension premiers sommets sont ceux de face // le dernier est dans elem2 -static inline void remplir_sommets(const Domaine_VEF& domaine_VEF, - int face, int elem1, int elem2, - ArrOfInt& sommets, - ArrOfInt& faces_op1, - ArrOfInt& faces_op2) +template +KOKKOS_INLINE_FUNCTION +static void remplir_sommets(const TabType& elem_som, const TabType& face_som, const TabType& elem_faces, const VectType& renum_som_perio, + int face, int face_associee, int elem1, int elem2, + int* sommets, + int* faces_op1, + int* faces_op2) { - int dplusun=Objet_U::dimension+1; - const IntTab& elem_som = domaine_VEF.domaine().les_elems(); - const IntTab& face_som = domaine_VEF.face_sommets(); - const IntTab& elem_faces = domaine_VEF.elem_faces(); - const Domaine& dom=domaine_VEF.domaine(); - for(int i=0; i::value) + size = (int)elem_faces.extent(1); + else + size = elem_faces.dimension(1); + int dim = size - 1; + for(int i=0; i +KOKKOS_INLINE_FUNCTION +static void calculer_grad(const ConstTabType& face_voisins, int elem1, int elem2, - const ArrOfDouble& coef_som, + const VectType& coef_som, int s, int fop1, int fop2, - const DoubleTab& normales, - ArrOfDouble& grad) + const TabType& normales, + double* grad) { - int dimension=Objet_U::dimension; + int dimension; + if constexpr (Kokkos::is_view::value) + dimension = (int)normales.extent(1); + else + dimension = normales.dimension(1); double signe=1; if(fop1!=-1) { @@ -435,7 +433,8 @@ static void calculer_grad(const IntTab& face_voisins, grad[comp]=signe*normales(fop1,comp); } else - grad=0; + for(int comp=0; comp +KOKKOS_INLINE_FUNCTION +static double dotproduct_array_fois_inverse_quantitee_entrelacee(const GradType1& grad1, const GradType2& grad2, const TabType& inverse_quantitee_entrelacee, int face) { + int size; + if constexpr (Kokkos::is_view::value) + size = (int)inverse_quantitee_entrelacee.extent(1); + else + size = inverse_quantitee_entrelacee.dimension(1); double dot=0; - int 
size=inverse_quantitee_entrelacee.dimension(1); for (int i=0; i +KOKKOS_INLINE_FUNCTION +void range(int i, int n, int j, int m, MatType& ARR, MatType& ARV, MatType& AVR, MatType& AVV, double coeff) { - const DoubleTab& normales = domaine_VEF.face_normales(); - const IntTab& face_voisins = domaine_VEF.face_voisins(); - assert(elem1==face_voisins(face, 0)); - assert(elem2==face_voisins(face, 1)); - - int dimension=Objet_U::dimension, - dplusdeux=dimension+2; - double psc; - //double coeff_som=1./(dimension)/(dimension+1); - - int nb_elem=domaine_VEF.nb_elem(); - int nb_som=domaine_VEF.nb_som(); - for(int i=0; isi) - { - if(rang==-1) - { - voisins[si].add(sj); - coeffs[si].add(0); - } - } - } + if (jsi); - if(siarete2) swap(arete1, arete2); - int rang=voisins[arete1].rang(arete2); - if(rang==-1) - { - voisins[arete1].add(arete2); - coeffs[arete1].add(0); - - } + stencil(nnz, 0) = arete1; + stencil(nnz, 1) = arete2; + nnz++; arete1=tmp; } } @@ -738,8 +600,8 @@ static void contribuer_matricePaPa(const Domaine_VEF& domaine_VEF, static void update_matricePaPa(const Domaine_VEF& domaine_VEF, const DoubleTab& inverse_quantitee_entrelacee, int face, int elem1, int elem2, - ArrOfInt& sommets, ArrOfInt& faces_op1, - ArrOfInt& faces_op2, + int* sommets, int* faces_op1, + int* faces_op2, Matrice_Morse& ARR, Matrice_Morse& ARV, Matrice_Morse& AVR, Matrice_Morse& AVV) { @@ -749,8 +611,6 @@ static void update_matricePaPa(const Domaine_VEF& domaine_VEF, const IntTab& face_voisins=domaine_VEF.face_voisins(); const DoubleTab& normales = domaine_VEF.face_normales(); int nb_aretes_tot=domaine_VEF.domaine().nb_aretes(); - int i, j, k, l; - double psc; // On ne traite pas les sommets -1 qui // sont en fin de tableau sommets: //while (sommets(dplusdeux-1)==-1) @@ -760,10 +620,10 @@ static void update_matricePaPa(const Domaine_VEF& domaine_VEF, int jmax=5; if(elem2==-1) jmax=4; - for(i=0; i<3; i++) + for(int i=0; i<3; i++) { int si=sommets[i]; - for(j=i+1; jarete2) swap(arete1, arete2); - 
if(arete1arete2) swap(arete1, arete2); - int rang=voisins[arete1].rang(arete2); - if(rang==-1) - { - voisins[arete1].add(arete2); - coeffs[arete1].add(0); - - } + stencil(nnz, 0) = arete1; + stencil(nnz, 1) = arete2; + nnz++; arete1=tmp; } } @@ -1096,7 +733,7 @@ static void update_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF, const DoubleTab& inverse_quantitee_entrelacee, int face, int elem, - ArrOfInt& sommets, ArrOfInt& faces_op1, + int* sommets, int* faces_op1, Matrice_Morse& ARR, Matrice_Morse& ARV, Matrice_Morse& AVR, Matrice_Morse& AVV) { @@ -1107,14 +744,10 @@ update_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF, const DoubleTab& normales = domaine_VEF.face_normales(); int nb_aretes_tot=domaine_VEF.domaine().nb_aretes(); - int i, j, k, l; - double psc; - - - for(i=0; i<3; i++) + for(int i=0; i<3; i++) { int si=sommets[i]; - for(j=i+1; j<4; j++) + for(int j=i+1; j<4; j++) { int sj=sommets[j]; int arete1; @@ -1131,10 +764,10 @@ update_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF, ARR(arete1,arete1)+=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradi, inverse_quantitee_entrelacee,face); int jj=j; - for(k=i; k<3; k++) + for(int k=i; k<3; k++) { int sk=sommets[k]; - for(l=jj+1; l<4; l++) + for(int l=jj+1; l<4; l++) { int sl=sommets[l]; int arete2= chercher_arete(domaine_VEF,elem, sl, sk, @@ -1148,19 +781,11 @@ update_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF, faces_op1[k], -1, faces_op1[l], -1, normales, gradj); - psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj, - inverse_quantitee_entrelacee,face); + double psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj, + inverse_quantitee_entrelacee,face); int tmp=arete1; if(arete1>arete2) swap(arete1, arete2); - if(arete1arete2) swap(arete1, arete2); - int rang=voisins[arete1].rang(arete2); - if(rang==-1) - { - voisins[arete1].add(arete2); - coeffs[arete1].add(0); - - } + stencil(nnz, 0) = arete1; + stencil(nnz, 1) = arete2; + nnz++; arete1=tmp; } } 
@@ -1224,7 +845,7 @@ static void update_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF, const DoubleTab& inverse_quantitee_entrelacee, int face, int elem, - ArrOfInt& sommets, ArrOfInt& faces_op1, + int* sommets, int* faces_op1, Matrice_Morse& ARR, Matrice_Morse& ARV, Matrice_Morse& AVR, Matrice_Morse& AVV) { @@ -1233,16 +854,12 @@ update_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF, const ArrOfInt& ok_arete=domaine_VEF.get_ok_arete(); const IntTab& face_voisins=domaine_VEF.face_voisins(); const DoubleTab& normales = domaine_VEF.face_normales(); - int i, j, k, l; int nb_aretes_tot=domaine_VEF.domaine().nb_aretes(); - double psc; - - - for(i=0; i<3; i++) + for(int i=0; i<3; i++) { int si=sommets[i]; - for(j=i+1; j<4; j++) + for(int j=i+1; j<4; j++) { int sj=sommets[j]; int arete1; @@ -1260,10 +877,10 @@ update_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF, ARR(arete1,arete1)+=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradi, inverse_quantitee_entrelacee,face); int jj=j; - for(k=i; k<3; k++) + for(int k=i; k<3; k++) { int sk=sommets[k]; - for(l=jj+1; l<4; l++) + for(int l=jj+1; l<4; l++) { int sl=sommets[l]; int arete2= chercher_arete(domaine_VEF,elem, sl, sk, @@ -1278,19 +895,11 @@ update_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF, faces_op1[l], -1, normales, gradj); projette(gradj, face, normales); - psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj, - inverse_quantitee_entrelacee,face); + double psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj, + inverse_quantitee_entrelacee,face); int tmp=arete1; if(arete1>arete2) swap(arete1, arete2); - if(arete1frontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for(int ind_face=0; ind_face tab_nnz(1); + tab_nnz(0) = nb_som_tot; + auto nnz = tab_nnz.view_rw(); + CIntArrView cl = tab_cl.view_ro(); + CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro(); + CIntTabView elem_som = domaine_VEF.domaine().les_elems().view_ro(); + 
CIntTabView face_som = domaine_VEF.face_sommets().view_ro(); + CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro(); + CIntArrView renum_som_perio = domaine_VEF.domaine().get_renum_som_perio().view_ro(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_tot, KOKKOS_LAMBDA(const int face) + { + if (cl(face)==NEUMANN // Neumann + || cl(face)==SYMMETRY // Symetrie + || cl(face)>0 // Periodique + || cl(face)==INTERNAL) // Face interne + { + int sommets[5] = { -1,-1,-1,-1,-1}; + int face_opp1[5]= { -1,-1,-1,-1,-1}; + int face_opp2[5]= { -1,-1,-1,-1,-1}; + int elem1=face_voisins(face, 0); + int elem2=face_voisins(face, 1); + int face_associee = cl(face) < 0 ? -1 : cl(face); // Periodique + remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2); + sort(sommets, face_opp1, face_opp2); + int size = dimension + 2; + for (int i = 0; i < size; i++) + { + int si = sommets[i]; + if (si<0) break; + for (int j = i + 1; j < size; j++) + { + int sj = sommets[j]; + if (sj<0) break; + if (sj > si || elem2==-1) + { + nnz_t slot = Kokkos::atomic_fetch_add(&nnz(0), 1); + stencil(slot, 0) = si; + stencil(slot, 1) = sj; + } + } + } + } + }); + end_gpu_timer(__KERNEL_NAME__); + tab_stencil.resize(tab_nnz(0), 2); matrice.typer("Matrice_Bloc"); Matrice_Bloc& matrice_bloc=ref_cast(Matrice_Bloc, matrice.valeur()); - matrice_bloc.remplir(voisins, coeffs, diag, domaine.nb_som(), domaine.nb_som_tot()); + matrice_bloc.remplir(tab_stencil, domaine.nb_som(), domaine.nb_som_tot()); Cerr << "Assemblage P1 OK" << finl; } @@ -1920,66 +1448,92 @@ void assemblerP1P1(const Domaine_dis_base& z, void updateP1P1(const Domaine_dis_base& z, const Domaine_Cl_dis_base& zcl, Matrice& matrice, - const DoubleTab& inverse_quantitee_entrelacee, const ArrOfDouble& coef_som) + const DoubleTab& tab_inverse_quantitee_entrelacee, const ArrOfDouble& tab_coef_som) { int dimension=Objet_U::dimension; const Domaine_VEF& 
domaine_VEF=ref_cast(Domaine_VEF, z); const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl); const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites(); - const IntTab& face_voisins = domaine_VEF.face_voisins(); - int nint = domaine_VEF.premiere_face_int(); - int nb_faces = domaine_VEF.nb_faces_tot(); - int elem1, elem2, face, ok; - ArrOfInt sommets(dimension+2); - ArrOfInt face_opp1(dimension+2); - ArrOfInt face_opp2(dimension+2); + int nb_faces_tot = domaine_VEF.nb_faces_tot(); + + ArrOfInt tab_cl(nb_faces_tot); + build_cl(tab_cl, les_cl); + int nb_som=domaine_VEF.nb_som(); Matrice_Bloc& A=ref_cast(Matrice_Bloc, matrice.valeur()); - Matrice_Morse_Sym& ARR=ref_cast(Matrice_Morse_Sym, A.get_bloc(0,0).valeur()); - Matrice_Morse& ARV=ref_cast(Matrice_Morse, A.get_bloc(0,1).valeur()); - Matrice_Morse& AVR=ref_cast(Matrice_Morse, A.get_bloc(1,0).valeur()); - Matrice_Morse& AVV=ref_cast(Matrice_Morse, A.get_bloc(1,1).valeur()); - // Faces de bord : - for (auto &itr : les_cl) - { - const Cond_lim& la_cl = itr; - const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) - { - ok = okface(ind_face, face, la_cl); - if (ok == -1) - break; - elem1 = face_voisins(face, 0); - elem2 = face_voisins(face, 1); - remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2); - sort(sommets, face_opp1, face_opp2); - if (ok == 3) - update_matrice_NeumannP1P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV); - else if (ok == 4) - update_matrice_SymetrieP1P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV); - else - update_matriceP1P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV); - } - } - face_associee=-1; - for (face = nint; face < 
nb_faces; face++) - { - elem1 = face_voisins(face, 0); - elem2 = face_voisins(face, 1); - if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes - { - remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2); - sort(sommets, face_opp1, face_opp2); - update_matriceP1P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV); - } - } - int nb_som=domaine_VEF.domaine().nb_som(); - for(int i=0; ifrontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for(int ind_face=0; ind_facefrontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) - { - ok = okface(ind_face, face, la_cl); - if (ok == -1) - break; - elem1 = face_voisins(face, 0); - elem2 = face_voisins(face, 1); - remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2); - if (ok == 3) - update_matrice_NeumannPaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV); - else if (ok == 4) - update_matrice_SymetriePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV); - else - update_matricePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV); - } - } - face_associee=-1; - for (face = nint; face < nb_faces; face++) - { - elem1 = face_voisins(face, 0); - elem2 = face_voisins(face, 1); - if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes - { - remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2); - update_matricePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV); - } + for (int face = 0; face < nb_faces_tot; face++) + { + if (cl(face) == DIRICHLET || cl(face) == OTHER) continue; + int elem1 = 
face_voisins(face, 0); + int elem2 = face_voisins(face, 1); + int face_associee = cl(face) < 0 ? -1 : cl(face); + remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2); + if (cl(face) == NEUMANN) + update_matrice_NeumannPaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV); + else if (cl(face) == SYMMETRY) + update_matrice_SymetriePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV); + else + update_matricePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV); } for(int i=0; ifrontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for(int ind_face=0; ind_facefrontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) - { - ok = okface(ind_face, face, la_cl); - if (ok == -1) - break; - elem1 = face_voisins(face, 0); - elem2 = face_voisins(face, 1); - remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2); - if (ok == 3) - update_matrice_NeumannP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV); - else if (ok == 4) { /* Do nothing */ } - else - update_matriceP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV); - } - } - face_associee = -1; - for (face = nint; face < nb_faces; face++) - { - elem1 = face_voisins(face, 0); - elem2 = face_voisins(face, 1); - if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes - { - remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2); - update_matriceP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV); - } + for (int face = 0; face < nb_faces_tot; 
face++) + { + if (cl(face) == DIRICHLET || cl(face) == SYMMETRY || cl(face) == OTHER) continue; // Dirichlet, Symetrie (rien), reverse periodic + int elem1 = face_voisins(face, 0); + int elem2 = face_voisins(face, 1); + int face_associee = cl(face) < 0 ? -1 : cl(face); + remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2); + if (cl(face) == NEUMANN) + update_matrice_NeumannP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV); + else + update_matriceP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV); } Cerr << "Update P0Pa OK" << finl; } @@ -2290,63 +1767,43 @@ void assemblerP1Pa(const Domaine_dis_base& z, Matrice& matrice, const DoubleTab& inverse_quantitee_entrelacee, const ArrOfDouble& coef_som) { - int dimension=Objet_U::dimension; const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z); const Domaine& domaine=domaine_VEF.domaine(); const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl); const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites(); const IntTab& face_voisins = domaine_VEF.face_voisins(); - int nint = domaine_VEF.premiere_face_int(); - int nb_faces = domaine_VEF.nb_faces_tot(); - int elem1, elem2, face, ok; - int nb_som = domaine.nb_som_tot(); - IntLists voisins(nb_som); - DoubleLists coeffs(nb_som); - ArrOfInt sommets(dimension+2); - ArrOfInt face_opp1(dimension+2); - ArrOfInt face_opp2(dimension+2); - // Faces de bord : - for(int i=0; ifrontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for(int ind_face=0; ind_facefrontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) - { - ok = okface(ind_face, face, la_cl); - if (ok == -1) - break; - elem1 = face_voisins(face, 0); - elem2 = face_voisins(face, 1); - remplir_sommets(domaine_VEF, face, 
elem1, elem2, sommets, face_opp1, face_opp2); - if (ok == 3) - update_matrice_NeumannP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV); - else if (ok == 4) - update_matrice_SymetrieP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV); - else - update_matriceP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV); - } - } - face_associee=-1; - for (face = nint; face < nb_faces; face++) - { - elem1 = face_voisins(face, 0); - elem2 = face_voisins(face, 1); - if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes - { - remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2); - update_matriceP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV); - } + for (int face = 0; face < nb_faces_tot; face++) + { + if (cl(face) == DIRICHLET || cl(face) == OTHER) continue; + int elem1 = face_voisins(face, 0); + int elem2 = face_voisins(face, 1); + int face_associee = cl(face) < 0 ? 
-1 : cl(face); + remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2); + if (cl(face) == NEUMANN) + update_matrice_NeumannP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV); + else if (cl(face) == SYMMETRY) + update_matrice_SymetrieP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV); + else + update_matriceP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV); } Cerr << "Update P1Pa OK" << finl; @@ -2422,122 +1859,129 @@ void assemblerP0P1(const Domaine_dis_base& z, const Domaine& domaine=domaine_VEF.domaine(); const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl); const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites(); - const IntTab& face_voisins = domaine_VEF.face_voisins(); - int nint = domaine_VEF.premiere_face_int(); - int nb_faces = domaine_VEF.nb_faces_tot(); - int nb_elem = domaine.nb_elem_tot(); - int elem1, elem2, face, ok; - IntLists voisins(nb_elem); - DoubleLists coeffs(nb_elem); - ArrOfInt sommets(dimension+2); - ArrOfInt face_opp1(dimension+2); - ArrOfInt face_opp2(dimension+2); - // Faces de bord : - for(int i=0; ifrontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for(int ind_face=0; ind_face tab_nnz(1); + tab_nnz(0) = 0; + auto nnz = tab_nnz.view_rw(); + CIntArrView cl = tab_cl.view_ro(); + CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro(); + CIntTabView elem_som = domaine_VEF.domaine().les_elems().view_ro(); + CIntTabView face_som = domaine_VEF.face_sommets().view_ro(); + CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro(); + CIntArrView renum_som_perio = domaine_VEF.domaine().get_renum_som_perio().view_ro(); + auto stencil = tab_stencil.view_wo(); + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), 
nb_faces_tot, KOKKOS_LAMBDA(const int face) + { + if (cl(face)==NEUMANN // Neumann + || cl(face)==SYMMETRY // Symetrie + || cl(face)>0 // Periodique + || cl(face)==INTERNAL) // Face interne + { + int sommets[5] = { -1,-1,-1,-1,-1}; + int face_opp1[5]= { -1,-1,-1,-1,-1}; + int face_opp2[5]= { -1,-1,-1,-1,-1}; + int elem1=face_voisins(face, 0); + int elem2=face_voisins(face, 1); + int face_associee = cl(face) < 0 ? -1 : cl(face); // Periodique + remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2); + sort(sommets, face_opp1, face_opp2); + int size = dimension + 2; + for (int i = 0; i < size; i++) + { + int si = sommets[i]; + if (si < 0) break; + nnz_t slot = Kokkos::atomic_fetch_add(&nnz(0), 1); + stencil(slot, 0) = elem1; + stencil(slot, 1) = si; + if (elem2 != -1) + { + slot = Kokkos::atomic_fetch_add(&nnz(0), 1); + stencil(slot, 0) = elem2; + stencil(slot, 1) = si; + } + } + } + }); + end_gpu_timer(__KERNEL_NAME__); + tab_stencil.resize(tab_nnz(0), 2); matrice.typer("Matrice_Bloc"); Matrice_Bloc& matrice_bloc=ref_cast(Matrice_Bloc, matrice.valeur()); - matrice_bloc.remplir(voisins, coeffs, domaine.nb_elem(), domaine.nb_elem_tot(), domaine.nb_som(), domaine.nb_som_tot()); + matrice_bloc.remplir(tab_stencil, domaine.nb_elem(), domaine.nb_elem_tot(), domaine.nb_som(), domaine.nb_som_tot()); Cerr << "Assemblage POP1 OK" << finl; } void updateP0P1(const Domaine_dis_base& z, const Domaine_Cl_dis_base& zcl, Matrice& matrice, - const DoubleTab& inverse_quantitee_entrelacee, const ArrOfDouble& coef_som) + const DoubleTab& tab_inverse_quantitee_entrelacee, const ArrOfDouble& tab_coef_som) { int dimension=Objet_U::dimension; const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z); const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl); const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites(); - const IntTab& face_voisins = domaine_VEF.face_voisins(); - int nint = 
domaine_VEF.premiere_face_int(); - int nb_faces = domaine_VEF.nb_faces_tot(); - int elem1, elem2, face, ok; - ArrOfInt sommets(dimension+2); - ArrOfInt face_opp1(dimension+2); - ArrOfInt face_opp2(dimension+2); + int nb_faces_tot = domaine_VEF.nb_faces_tot(); + ArrOfInt tab_cl(nb_faces_tot); + build_cl(tab_cl, les_cl); + int nb_elem=domaine_VEF.nb_elem(); + int nb_som=domaine_VEF.nb_som(); Matrice_Bloc& A=ref_cast(Matrice_Bloc, matrice.valeur()); - Matrice_Morse& ARR=ref_cast(Matrice_Morse, A.get_bloc(0,0).valeur()); - Matrice_Morse& ARV=ref_cast(Matrice_Morse, A.get_bloc(0,1).valeur()); - Matrice_Morse& AVR=ref_cast(Matrice_Morse, A.get_bloc(1,0).valeur()); - Matrice_Morse& AVV=ref_cast(Matrice_Morse, A.get_bloc(1,1).valeur()); - // Faces de bord : - for (auto &itr : les_cl) - { - const Cond_lim& la_cl = itr; - const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis()); - int nb_faces_bord_tot = le_bord.nb_faces_tot(); - for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++) - { - ok = okface(ind_face, face, la_cl); - if (ok == -1) - break; - elem1 = face_voisins(face, 0); - elem2 = face_voisins(face, 1); - remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2); - sort(sommets, face_opp1, face_opp2); - if (ok == 3) - update_matrice_NeumannP0P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV); - else if (ok == 4) { /* Do nothing */ } - else - update_matriceP0P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV); - } - } - face_associee=-1; - for(face=nint; face=k1) - { - double prod = beta * beta * coeff[n]; // Calcul de beta*beta*Ak1k2 - for (int som1=0; som1=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV); - } - } - if (k1!=k2) - { - for (int som1=0; som1=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV); - } - } - } - } - } - } + 
Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_lignes), KOKKOS_LAMBDA(const int i) + { + int k1=ligne+i; // Element k1 + for (auto n=tab1(i)-1; n=k1) + { + double prod = beta * beta * coeff(n); // Calcul de beta*beta*Ak1k2 + for (int som1=0; som1=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV); + } + } + if (k1!=k2) + { + for (int som1=0; som1=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV); + } + } + } + } + } + }); + end_gpu_timer(__KERNEL_NAME__); colonne+=nb_colonnes; } ligne+=nb_lignes; } // On parcours les elements de la matrice A01 - //Cerr << "[" << Process::me() << "] Contribution de A01 dans A11~" << finl; ligne=0; for (int i_bloc=0; i_bloc=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV); - if (s1>=s2) range(prod,s2,nb_som,s1,nb_som,A11RR,A11RV,A11VR,A11VV); - } - } - } + Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_lignes), KOKKOS_LAMBDA(const int i) + { + int k=ligne+i; // Element k + for (auto n=tab1(i)-1; n=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV); + if (s1>=s2) range(prod,s2,nb_som,s1,nb_som,A11RR,A11RV,A11VR,A11VV); + } + } + }); + end_gpu_timer(__KERNEL_NAME__); colonne+=nb_colonnes; } ligne+=nb_lignes; @@ -1081,15 +1082,16 @@ void operation11(Matrice_Bloc& A00, Matrice_Bloc& A01, Matrice_Bloc& A11, double void operation01(Matrice_Bloc& A00, Matrice_Bloc& A01, double alpha, double beta, const Domaine& domaine) { //Cerr << "[" << Process::me() << "] Operation01" << finl; - Matrice_Morse& A01RR=ref_cast(Matrice_Morse, A01.get_bloc(0,0).valeur()); - Matrice_Morse& A01RV=ref_cast(Matrice_Morse, A01.get_bloc(0,1).valeur()); - Matrice_Morse& A01VR=ref_cast(Matrice_Morse, A01.get_bloc(1,0).valeur()); - Matrice_Morse& A01VV=ref_cast(Matrice_Morse, A01.get_bloc(1,1).valeur()); - const IntTab& les_elems=domaine.les_elems(); - const Domaine& dom=domaine; - int nb_elem=A01RR.nb_lignes(); - int nb_som=A01RR.nb_colonnes(); - int 
nb_som_elem=les_elems.dimension(1); + int nb_elem = ref_cast(Matrice_Morse, A01.get_bloc(0,0).valeur()).nb_lignes(); + int nb_som = ref_cast(Matrice_Morse, A01.get_bloc(0,0).valeur()).nb_colonnes(); + Matrice_Morse_View A01RR, A01RV, A01VR, A01VV; + A01RR.set(ref_cast(Matrice_Morse, A01.get_bloc(0,0).valeur())); + A01RV.set(ref_cast(Matrice_Morse, A01.get_bloc(0,1).valeur())); + A01VR.set(ref_cast(Matrice_Morse, A01.get_bloc(1,0).valeur())); + A01VV.set(ref_cast(Matrice_Morse, A01.get_bloc(1,1).valeur())); + CIntTabView les_elems = domaine.les_elems().view_ro(); + CIntArrView renum_som_perio = domaine.get_renum_som_perio().view_ro(); + int nb_som_elem = domaine.les_elems().dimension(1); // On parcours les coefficients de A00 int ligne=0; for (int i_bloc=0; i_bloc=k1) - { - double prod = -alpha * beta * coeff[n]; // Calcul de -alpha*beta*Ak1k2 - for (int som=0; som=k1) + { + double prod = -alpha * beta * coeff(n); // Calcul de -alpha*beta*Ak1k2 + for (int som=0; som= 0) debit_face += porosite_face(num_face) * vitesse(num_face, axe) * - std::fabs(face_normales(num_face, axe)); + Kokkos::fabs(face_normales(num_face, axe)); else { for (int i = 0; i < dim; i++) @@ -170,4 +170,3 @@ void Terme_Source_Canal_perio_VEF_P1NC::calculer_debit(double& debit_e) const } - diff --git a/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.h b/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.h index c3c51e079e..6e69a2ba02 100644 --- a/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.h +++ b/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.h @@ -47,7 +47,7 @@ protected : OBS_PTR(Domaine_Cl_VEF) le_dom_Cl_VEF; void associer_domaines(const Domaine_dis_base& ,const Domaine_Cl_dis_base& ) override; - public_for_cuda + protected_but_public_for_cuda void calculer_debit(double&) const override; // les attributs ont ete mis dans la classe mere diff --git a/src/VEF/Sources/Dilatable/Source_Fluide_Dilatable_VEF_Proto.h 
b/src/VEF/Sources/Dilatable/Source_Fluide_Dilatable_VEF_Proto.h index dd181ea78c..46ad13345e 100644 --- a/src/VEF/Sources/Dilatable/Source_Fluide_Dilatable_VEF_Proto.h +++ b/src/VEF/Sources/Dilatable/Source_Fluide_Dilatable_VEF_Proto.h @@ -32,7 +32,7 @@ class Source_Fluide_Dilatable_VEF_Proto void associer_domaines_impl(const Domaine_dis_base& domaine,const Domaine_Cl_dis_base& domaine_cl); void associer_volume_porosite_impl(const Domaine_dis_base& domaine, DoubleVect& volumes, DoubleVect& porosites); - public_for_cuda + protected_but_public_for_cuda void ajouter_impl(const Equation_base& eqn, const DoubleVect& g, const int dimension, const double rho_m, const DoubleTab& tab_rho, DoubleTab& resu) const; protected: diff --git a/src/VEF/Sources/Sources_It_Eval/Iterateur_Source_VEF_Face.h b/src/VEF/Sources/Sources_It_Eval/Iterateur_Source_VEF_Face.h index ee720720ed..f866ee003f 100644 --- a/src/VEF/Sources/Sources_It_Eval/Iterateur_Source_VEF_Face.h +++ b/src/VEF/Sources/Sources_It_Eval/Iterateur_Source_VEF_Face.h @@ -70,7 +70,7 @@ class Iterateur_Source_VEF_Face: public Iterateur_Source_base mutable DoubleTab tab_coef_; DoubleVect volumes_cl_dirichlet_; - public_for_cuda + protected_but_public_for_cuda template DoubleTab& ajouter_faces_standard(const int, DoubleTab& ) const; template DoubleTab& ajouter_faces_non_standard(const int, DoubleTab& ) const; }; @@ -191,6 +191,7 @@ DoubleTab& Iterateur_Source_VEF_Face<_TYPE_>::ajouter_faces_non_standard(const i CDoubleArrView coef = static_cast(tab_coef_).view_ro(); DoubleArrView bilan = tab_bilan.view_rw(); DoubleTabView resu = tab_resu.view_rw(); + ToDo_Kokkos("create once source_view"); Kokkos::View source_view("source", nf, ncomp); Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nf), KOKKOS_LAMBDA(const int ind_face) { diff --git a/src/VEF/Sources/Terme_Source_Acceleration_VEF_Face.cpp b/src/VEF/Sources/Terme_Source_Acceleration_VEF_Face.cpp index 7c88ed7178..68ca781236 100644 --- 
a/src/VEF/Sources/Terme_Source_Acceleration_VEF_Face.cpp +++ b/src/VEF/Sources/Terme_Source_Acceleration_VEF_Face.cpp @@ -202,7 +202,6 @@ DoubleTab& Terme_Source_Acceleration_VEF_Face::ajouter(DoubleTab& resu) const const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis()); int nb_faces_bord=le_bord.nb_faces(); ArrOfInt fait(nb_faces_bord); - fait = 0; for (int ind_face=0; ind_face # + uprime_uprime moyenne { + t_deb 0 t_fin 1e6 + sources { + transformation { + methode formule + expression 1 uprime*uprime + localisation elem + sources { + transformation { methode composante numero 0 nom_source uprime sources_reference { uprime } } + } + } + } + } + vprime_vprime moyenne { + t_deb 0 t_fin 1e6 + sources { + transformation { + methode formule + expression 1 vprime*vprime + localisation elem + sources { + transformation { methode composante numero 1 nom_source vprime sources_reference { uprime } } + } + } + } + } + wprime_wprime moyenne { + t_deb 0 t_fin 1e6 + sources { + transformation { + methode formule + expression 1 wprime*wprime + localisation elem + sources { + transformation { methode composante numero 2 nom_source wprime sources_reference { uprime } } + } + } + } + } + uprime_vprime moyenne { + t_deb 0 t_fin 1e6 + sources { + transformation { + methode formule + expression 1 uprime*vprime + localisation elem + sources { + transformation { methode composante numero 0 nom_source uprime sources_reference { uprime } } , + transformation { methode composante numero 1 nom_source vprime sources_reference { uprime } } + } + } + } + } + uprime_wprime moyenne { + t_deb 0 t_fin 1e6 + sources { + transformation { + methode formule + expression 1 uprime*wprime + localisation elem + sources { + transformation { methode composante numero 0 nom_source uprime sources_reference { uprime } } , + transformation { methode composante numero 2 nom_source wprime sources_reference { uprime } } + } + } + } + } + vprime_wprime moyenne { + t_deb 0 t_fin 1e6 + sources { + 
transformation { + methode formule + expression 1 vprime*wprime + localisation elem + sources { + transformation { methode composante numero 1 nom_source vprime sources_reference { uprime } } , + transformation { methode composante numero 2 nom_source wprime sources_reference { uprime } } + } + } + } + } + + # Derivees de vitesse dui/dxj # + du_dx transformation { methode composante numero 0 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } } + du_dy transformation { methode composante numero 1 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } } + du_dz transformation { methode composante numero 2 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } } + dv_dx transformation { methode composante numero 3 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } } + dv_dy transformation { methode composante numero 4 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } } + dv_dz transformation { methode composante numero 5 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } } + dw_dx transformation { methode composante numero 6 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } } + dw_dy transformation { methode composante numero 7 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } } + dw_dz transformation { methode composante numero 8 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } } + + # Moyenne des derivees de vitesse (obligatoire pour calculer ensuite les ecarts-types) # + MOY_du_dx moyenne { t_deb 0 t_fin 1e6 sources_reference { du_dx } } + MOY_du_dy moyenne { t_deb 0 t_fin 1e6 sources_reference { du_dy } } + MOY_du_dz moyenne { t_deb 0 t_fin 1e6 sources_reference { du_dz } } + MOY_dv_dx moyenne { t_deb 0 t_fin 1e6 sources_reference { dv_dx } } + MOY_dv_dy moyenne { t_deb 0 t_fin 1e6 sources_reference { dv_dy } } + MOY_dv_dz moyenne { t_deb 0 t_fin 
1e6 sources_reference { dv_dz } } + MOY_dw_dx moyenne { t_deb 0 t_fin 1e6 sources_reference { dw_dx } } + MOY_dw_dy moyenne { t_deb 0 t_fin 1e6 sources_reference { dw_dy } } + MOY_dw_dz moyenne { t_deb 0 t_fin 1e6 sources_reference { dw_dz } } + + # Ecart-type des derivees de vitesse # + EC_du_dx ecart_type { t_deb 0 t_fin 1e6 sources_reference { du_dx } } + EC_du_dy ecart_type { t_deb 0 t_fin 1e6 sources_reference { du_dy } } + EC_du_dz ecart_type { t_deb 0 t_fin 1e6 sources_reference { du_dz } } + EC_dv_dx ecart_type { t_deb 0 t_fin 1e6 sources_reference { dv_dx } } + EC_dv_dy ecart_type { t_deb 0 t_fin 1e6 sources_reference { dv_dy } } + EC_dv_dz ecart_type { t_deb 0 t_fin 1e6 sources_reference { dv_dz } } + EC_dw_dx ecart_type { t_deb 0 t_fin 1e6 sources_reference { dw_dx } } + EC_dw_dy ecart_type { t_deb 0 t_fin 1e6 sources_reference { dw_dy } } + EC_dw_dz ecart_type { t_deb 0 t_fin 1e6 sources_reference { dw_dz } } + + # Taille de maille # + Delta transformation { methode formule expression 1 vol^(1/3) localisation elem source refChamp { Pb_champ pb_dom volume_maille nom_source vol } } + + # Energie cinetique turbulente # + TKE transformation { + methode formule + expression 1 0.5*norme_EC_vitesse*norme_EC_vitesse + sources { + transformation { + methode norme + localisation elem + nom_source norme_EC_vitesse + sources_reference { EC_vitesse } + } + } + } + + # Dissipation turbulente # + epsilon transformation { + methode formule + expression 1 (1.41289e-04/1)*(EC_du_dx^2+EC_du_dy^2+EC_du_dz^2+EC_dv_dx^2+EC_dv_dy^2+EC_dv_dz^2+EC_dw_dx^2+EC_dw_dy^2+EC_dw_dz^2) + localisation elem + sources_reference { EC_du_dx , EC_du_dy , EC_du_dz , EC_dv_dx , EC_dv_dy , EC_dv_dz , EC_dw_dx , EC_dw_dy , EC_dw_dz } + } + # y+ et utau moyennes sur les parois # + MOY_yplus_ reduction_0D { + methode moyenne + source interpolation { + localisation elem + domaine ParoisD_dom + sources_reference { MOY_yplus } + } + } + MOY_utau_ transformation { + methode formule + expression 1 
(2.84e-5/0.0976)*MOY_ustar/0.00125 + localisation elem + sources_reference { MOY_ustar } + } + } + Sondes { + MOY_yplus__dom MOY_yplus_ periode 1e-6 numero_elem_sur_maitre 0 + MOY_utau_dom MOY_utau_ periode 1e-6 numero_elem_sur_maitre 0 + MOY_taux_cis_dom MOY_taux_cis periode 1e-6 numero_elem_sur_maitre 0 + MOY_taux_cis_wall_dom MOY_taux_cis_wall periode 1e-6 numero_elem_sur_maitre 0 + MOY_vitesse_dom MOY_vitesse periode 1e-6 segment 10 1 0 0 1 2 0 + EC_vitesse_dom EC_vitesse periode 1e-6 position_like MOY_vitesse_dom + uprime_dom uprime periode 1e-6 position_like MOY_vitesse_dom + } + format lml fichier BFS + Champs dt_post 1e+6 + { + MOY_du_dy elem + MOY_pression elem + MOY_pression som + # uprime elem # # diff CPU-GPU # + # MOY_vitesse elem # # diff CPU-GPU # + # EC_vitesse elem # # diff CPU-GPU # + MOY_nut elem + TKE elem + epsilon elem + # MOY_yplus elem # # assert # + EC_yplus elem + uprime_uprime elem + uprime_vprime elem + uprime_wprime elem + vprime_vprime elem + vprime_wprime elem + wprime_wprime elem + } + } + } + Sauvegarde_simple single_hdf Cas_dom.sauv +} + +EcritureLectureSpecial 0 + +Solve pb + +End diff --git a/tests/GPU/BFS/BFS.lml.gz b/tests/GPU/BFS/BFS.lml.gz new file mode 100644 index 0000000000..648d393137 Binary files /dev/null and b/tests/GPU/BFS/BFS.lml.gz differ diff --git a/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx90a b/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx90a new file mode 100644 index 0000000000..fc9eb16502 --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx90a @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 08-06-2026 -- 14:38:46 +OS: g1157__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD EPYC 7A53 64-Core Processor +Total number of threads:128 +GPU model: AMD Instinct MI250X +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 32.5304 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.762309 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 6.35718 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.706353 +Standard deviation between time steps: 0.202927 +Time elapsed in the skipped time steps: 0.741004 + + +Standard counter description | 
Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0733417 | 10.4 | 6 +Convection operator | 0.008983073 | 1.3 | 6 +Diffusion operator | 0.01601393 | 2.3 | 6 +Gradient operator | 0.004995961 | 0.7 | 12 +Divergence operator | 0.005065148 | 0.7 | 8 +Source terms | 0.0007179104 | 0.1 | 3 +Update ::mettre_a_jour | 0.1242929 | 17.6 | 2 +Computation of the time step dt | 0.0009378377 | 0.1 | 4 +Turbulence model::update | 0.01756681 | 2.5 | 2 +Post-treatment operations | 0.4624406 | 65.5 | 2 +Other operations | -0.008002771 | -1.1 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0725315 | 10.3 | 6 | +Kernels: | 0.561421 | 79.5 | 1328 | +Copy host to device: | 0.0021288 | 0.3 | 92 | 3.8 GB/s +Copy device to host: | 0.0036422 | 0.5 | 47 | 14.7 GB/s +Alloc/Free on device: | 0.00138089 | 0.2 | 590 | +GPU: 90% Copy H<->D: 0.82% Alloc/free: 0.2% Comm: 0% CPU & I/O: 9.2% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.976141 + +Total time for the whole computation 40.6048 + +[Slurm] Power consumption (51 s): 0.486 kW 0.007 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx942 b/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx942 new file mode 100644 
index 0000000000..4294744882 --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx942 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 08-06-2026 -- 14:33:56 +OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD Instinct MI300A Accelerator +Total number of threads:192 +GPU model: AMD Instinct MI300A +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 29.709 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.807292 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 
1.91469 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.212743 +Standard deviation between time steps: 0.0987842 +Time elapsed in the skipped time steps: 0.47084 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0450238 | 21.2 | 6 +Convection operator | 0.005347514 | 2.5 | 6 +Diffusion operator | 0.008616399 | 4.1 | 6 +Gradient operator | 0.003200318 | 1.5 | 12 +Divergence operator | 0.003620461 | 1.7 | 8 +Source terms | 0.000460308 | 0.2 | 3 +Update ::mettre_a_jour | 0.05670063 | 26.7 | 2 +Computation of the time step dt | 0.0005922301 | 0.3 | 4 +Turbulence model::update | 0.004389234 | 2.1 | 2 +Post-treatment operations | 0.08205428 | 38.6 | 2 +Other operations | 0.002738007 | 1.3 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.044401 | 20.9 | 6 | +Kernels: | 0.105516 | 49.6 | 1328 | +Copy host to device: | 0.00202539 | 1.0 | 92 | 3.9 GB/s +Copy device to host: | 0.00238012 | 1.1 | 47 | 22.5 GB/s +Alloc/Free on device: | 0.000647061 | 0.3 | 590 | +GPU: 70% Copy H<->D: 2.1% Alloc/free: 0.3% Comm: 0% CPU & I/O: 27% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.10731 + +Total time 
for the whole computation 33.2018 + +[Slurm] Power consumption (44 s): 0.665 kW 0.008 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/BFS/BFS_BENCH.TU.dalianvl_cc100 b/tests/GPU/BFS/BFS_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..509b9d567f --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:12:20 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 14.0361 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.444288 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.31512 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.146125 +Standard deviation between time steps: 0.0606734 +Time elapsed in the skipped time steps: 0.284713 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0404097 | 27.7 | 6 +Convection operator | 0.003551145 | 2.4 | 6 +Diffusion operator | 0.00504075 | 3.4 | 6 +Gradient operator | 0.002410326 | 1.6 | 12 +Divergence operator | 0.003548001 | 2.4 | 8 +Source terms | 0.000598559 | 0.4 | 3 +Update ::mettre_a_jour | 0.02725214 | 18.6 | 2 +Computation of the time step dt | 0.0004161804 | 0.3 | 4 +Turbulence model::update | 0.001749288 | 1.2 | 2 +Post-treatment operations | 0.05599893 | 38.3 | 2 +Other operations | 0.005149901 | 3.5 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0392981 | 26.9 | 6 | +Kernels: | 0.0387839 | 26.5 | 1328 | +Copy host to device: | 0.00171442 | 1.2 | 92 | 4.7 GB/s +Copy device to host: | 0.00120926 | 0.8 | 35 | 44.3 GB/s +Alloc/Free on device: | 0.00383176 | 2.6 | 614 | +GPU: 53% Copy H<->D: 2% Alloc/free: 2.6% Comm: 0% CPU & I/O: 42% 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.943158 + +Total time for the whole computation 16.5791 + diff --git a/tests/GPU/BFS/BFS_BENCH.TU.eureka_cc89 b/tests/GPU/BFS/BFS_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..ae0b9baaa8 --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.eureka_cc89 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 30-04-2026 -- 19:56:19 +OS: eureka__Linux__x86_64__6.14.0-37-generic__#37~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 20 10:25:38 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 112.693 + +Number of calls to the linear solver per time step: 4 +Average time of the 
resolution of the linear problem per call: 1.45516 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 166.877 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 18.5419 +Standard deviation between time steps: 5.80467 +Time elapsed in the skipped time steps: 17.0173 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.109143 | 0.6 | 6 +Convection operator | 0.0102421 | 0.1 | 6 +Diffusion operator | 0.01638421 | 0.1 | 6 +Gradient operator | 0.008781796 | 0.0 | 12 +Divergence operator | 0.008933099 | 0.0 | 8 +Source terms | 0.08644986 | 0.5 | 3 +Update ::mettre_a_jour | 12.18951 | 65.7 | 2 +Computation of the time step dt | 0.08896099 | 0.5 | 4 +Turbulence model::update | 2.906248 | 15.7 | 2 +Post-treatment operations | 5.947749 | 32.1 | 2 +Other operations | -2.830534 | -15.3 | + +Average number of iteration of the linear solver per call: 11.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0760458 | 0.4 | 6 | +Kernels: | 0.494343 | 2.7 | 1161 | +Copy host to device: | 0.421876 | 2.3 | 128 | 5.0 GB/s +Copy device to host: | 
0.589114 | 3.2 | 213 | 5.7 GB/s +Alloc/Free on device: | 0.00224632 | 0.0 | 528 | +GPU: 3.1% Copy H<->D: 5.5% Alloc/free: 0.012% Comm: 0% CPU & I/O: 91% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 2.21071 + +Total time for the whole computation 298.798 + diff --git a/tests/GPU/BFS/BFS_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/BFS/BFS_BENCH.TU.irene-amd-ccrt_cc70 new file mode 100644 index 0000000000..2b67b15e45 --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.irene-amd-ccrt_cc70 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 23-04-2026 -- 09:14:58 +OS: irene7066__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz +Total number of threads:80 +GPU model: Tesla V100-SXM2-16GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 62.3224 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 1.11062 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 93.2092 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 10.3566 +Standard deviation between time steps: 3.95843 +Time elapsed in the skipped time steps: 9.48538 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.132225 | 1.3 | 6 +Convection operator | 0.009519812 | 0.1 | 6 +Diffusion operator | 0.01566439 | 0.2 | 6 +Gradient operator | 0.01099676 | 0.1 | 12 +Divergence operator | 0.006549923 | 0.1 | 8 +Source terms | 0.05627705 | 0.5 | 3 +Update ::mettre_a_jour | 6.661139 | 64.3 | 2 +Computation of the time step dt | 0.04126574 | 0.4 | 4 +Turbulence model::update | 1.384435 | 13.4 | 2 +Post-treatment operations | 3.349094 | 32.3 | 2 +Other operations | -1.310585 | -12.7 | + +Average number of iteration of the linear solver per call: 11.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.110164 | 1.1 | 6 | +Kernels: | 0.485927 | 4.7 | 1161 | +Copy host to device: | 0.480492 | 4.6 | 128 | 4.4 GB/s +Copy device to host: | 0.755923 | 7.3 | 213 | 4.5 GB/s +Alloc/Free on device: | 0.00129667 | 0.0 | 528 | +GPU: 5.8% Copy H<->D: 12% Alloc/free: 0.013% Comm: 0% CPU & I/O: 82% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.46305 + +Total time for the whole computation 166.48 + +[Slurm] Power consumption (196 s): 25.598 kW 1.394 kWh 0.139 € (0.10€/kWh) diff --git a/tests/GPU/BFS/BFS_BENCH.TU.is157091_cc86 b/tests/GPU/BFS/BFS_BENCH.TU.is157091_cc86 new file mode 100644 index 0000000000..a46ebac5f2 --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.is157091_cc86 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 31-05-2026 -- 19:50:30 +OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 +CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores +Total number of threads:64 +GPU model: NVIDIA RTX A6000 +CUDA runtime version: 12.90 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 15.8403 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.559062 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.46987 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.27443 +Standard deviation between time steps: 0.0601585 +Time elapsed in the skipped time steps: 0.465938 + + +Standard counter description | Time/step 
| % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.114364 | 41.7 | 6 +Convection operator | 0.007063331 | 2.6 | 6 +Diffusion operator | 0.01262074 | 4.6 | 6 +Gradient operator | 0.006030292 | 2.2 | 12 +Divergence operator | 0.004501881 | 1.6 | 8 +Source terms | 0.0007731341 | 0.3 | 3 +Update ::mettre_a_jour | 0.05517616 | 20.1 | 2 +Computation of the time step dt | 0.001173593 | 0.4 | 4 +Turbulence model::update | 0.00733994 | 2.7 | 2 +Post-treatment operations | 0.06002287 | 21.9 | 2 +Other operations | 0.005364006 | 2.0 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.113484 | 41.4 | 6 | +Kernels: | 0.10959 | 39.9 | 1328 | +Copy host to device: | 0.0015817 | 0.6 | 92 | 5.1 GB/s +Copy device to host: | 0.00538939 | 2.0 | 35 | 9.9 GB/s +Alloc/Free on device: | 0.00187427 | 0.7 | 614 | +GPU: 81% Copy H<->D: 2.5% Alloc/free: 0.68% Comm: 0% CPU & I/O: 15% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.673743 + +Total time for the whole computation 19.4498 + diff --git a/tests/GPU/BFS/BFS_BENCH.TU.is159479_cc120 b/tests/GPU/BFS/BFS_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..e4c8b3fe0a --- /dev/null +++ 
b/tests/GPU/BFS/BFS_BENCH.TU.is159479_cc120 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-05-2026 -- 07:01:01 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May 1 12:45:19 UTC 2026 (6 +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 12.9809 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.365144 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the 
time loop: 1.20401 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.133779 +Standard deviation between time steps: 0.0368064 +Time elapsed in the skipped time steps: 0.346852 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0500903 | 37.4 | 6 +Convection operator | 0.003679159 | 2.8 | 6 +Diffusion operator | 0.006074906 | 4.5 | 6 +Gradient operator | 0.001982798 | 1.5 | 12 +Divergence operator | 0.002428775 | 1.8 | 8 +Source terms | 0.0003626534 | 0.3 | 3 +Update ::mettre_a_jour | 0.02154634 | 16.1 | 2 +Computation of the time step dt | 0.0004629608 | 0.3 | 4 +Turbulence model::update | 0.002795345 | 2.1 | 2 +Post-treatment operations | 0.04208834 | 31.5 | 2 +Other operations | 0.002267656 | 1.7 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0496036 | 37.1 | 6 | +Kernels: | 0.0422833 | 31.6 | 1390 | +Copy host to device: | 0.0012351 | 0.9 | 92 | 6.5 GB/s +Copy device to host: | 0.00569077 | 4.3 | 36 | 9.4 GB/s +Alloc/Free on device: | 0.00122093 | 0.9 | 614 | +GPU: 69% Copy H<->D: 5.2% Alloc/free: 0.91% Comm: 0% CPU & I/O: 25% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.542056 
+ +Total time for the whole computation 15.0739 + diff --git a/tests/GPU/BFS/BFS_BENCH.TU.is246827_cc86 b/tests/GPU/BFS/BFS_BENCH.TU.is246827_cc86 new file mode 100644 index 0000000000..f89780e8bd --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.is246827_cc86 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 04-05-2026 -- 19:18:54 +OS: is246827__Linux__x86_64__6.2.9-300.fc38.x86_64__#1 SMP PREEMPT_DYNAMIC Thu Mar 30 22:32:58 UTC 2023 +CPU model : 12th Gen Intel(R) Core(TM) i7-12850HX +Total number of threads:24 +GPU model: NVIDIA RTX A3000 12GB Laptop GPU +CUDA runtime version: 12.90 +CUDA drivers version: 12.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 26.4517 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.818174 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 8.40028 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.933365 +Standard deviation between time steps: 0.137856 +Time elapsed in the skipped time steps: 1.01629 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.485421 | 52.0 | 6 +Convection operator | 0.0194105 | 2.1 | 6 +Diffusion operator | 0.04692109 | 5.0 | 6 +Gradient operator | 0.01502096 | 1.6 | 12 +Divergence operator | 0.01077627 | 1.2 | 8 +Source terms | 0.001760336 | 0.2 | 3 +Update ::mettre_a_jour | 0.1845253 | 19.8 | 2 +Computation of the time step dt | 0.005321171 | 0.6 | 4 +Turbulence model::update | 0.02338764 | 2.5 | 2 +Post-treatment operations | 0.1294219 | 13.9 | 2 +Other operations | 0.01139873 | 1.2 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.48335 | 51.8 | 6 | +Kernels: | 0.355819 | 38.1 | 1346 | +Copy host to device: | 0.00315205 | 0.3 | 80 | 4.5 GB/s +Copy device to host: | 0.0202789 | 2.2 | 86 | 7.7 GB/s +Alloc/Free on device: | 0.00149475 | 0.2 | 568 | +GPU: 90% Copy H<->D: 2.5% Alloc/free: 0.16% Comm: 0% CPU & I/O: 7.4% 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.929967 + +Total time for the whole computation 36.7983 + diff --git a/tests/GPU/BFS/BFS_BENCH.TU.is247793_gfx1100 b/tests/GPU/BFS/BFS_BENCH.TU.is247793_gfx1100 new file mode 100644 index 0000000000..3d23b652a7 --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.is247793_gfx1100 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 18:29:42 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 15.1276 + +Number of calls to the linear solver per time step: 4 +Average time 
of the resolution of the linear problem per call: 0.812923 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 3.70678 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.411865 +Standard deviation between time steps: 0.0871853 +Time elapsed in the skipped time steps: 0.705107 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.116422 | 28.3 | 6 +Convection operator | 0.008319375 | 2.0 | 6 +Diffusion operator | 0.01516901 | 3.7 | 6 +Gradient operator | 0.005098916 | 1.2 | 12 +Divergence operator | 0.005316149 | 1.3 | 8 +Source terms | 0.0009473977 | 0.2 | 3 +Update ::mettre_a_jour | 0.05988134 | 14.5 | 2 +Computation of the time step dt | 0.001219753 | 0.3 | 4 +Turbulence model::update | 0.008179361 | 2.0 | 2 +Post-treatment operations | 0.1878769 | 45.6 | 2 +Other operations | 0.003434582 | 0.8 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.115614 | 28.1 | 6 | +Kernels: | 0.247483 | 60.1 | 1328 | +Copy host to device: | 0.00247318 | 0.6 | 92 | 3.2 
GB/s +Copy device to host: | 0.00313158 | 0.8 | 35 | 17.1 GB/s +Alloc/Free on device: | 0.00252754 | 0.6 | 614 | +GPU: 88% Copy H<->D: 1.4% Alloc/free: 0.61% Comm: 0% CPU & I/O: 9.9% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.993357 + +Total time for the whole computation 20.5329 + diff --git a/tests/GPU/BFS/BFS_BENCH.TU.jean-zay_cc90 b/tests/GPU/BFS/BFS_BENCH.TU.jean-zay_cc90 new file mode 100644 index 0000000000..93e7847b7e --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.jean-zay_cc90 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-05-2026 -- 09:48:00 +OS: jzxh116__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 27.5174 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.937468 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.7255 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.191723 +Standard deviation between time steps: 0.135704 +Time elapsed in the skipped time steps: 1.00399 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0478737 | 25.0 | 6 +Convection operator | 0.005209725 | 2.7 | 6 +Diffusion operator | 0.00718375 | 3.7 | 6 +Gradient operator | 0.002878709 | 1.5 | 12 +Divergence operator | 0.003925051 | 2.0 | 8 +Source terms | 0.0005442999 | 0.3 | 3 +Update ::mettre_a_jour | 0.02739966 | 14.3 | 2 +Computation of the time step dt | 0.0004660597 | 0.2 | 4 +Turbulence model::update | 0.002827991 | 1.5 | 2 +Post-treatment operations | 0.08851215 | 46.2 | 2 +Other operations | 0.004901591 | 2.6 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0470228 | 24.5 | 6 | +Kernels: | 0.0509353 | 26.6 | 1390 | +Copy host to device: | 0.00195763 | 1.0 | 92 | 4.1 GB/s +Copy device to host: | 0.00493976 | 2.6 | 36 | 10.9 GB/s +Alloc/Free on device: | 0.00211832 | 1.1 | 614 | +GPU: 51% Copy H<->D: 3.6% Alloc/free: 1.1% Comm: 0% CPU & I/O: 44% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.87826 + +Total time for the whole computation 31.1252 + +[Slurm] Power consumption (57 s): 0.444 kW 0.007 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/BFS/BFS_BENCH.TU.lumi_gfx90a b/tests/GPU/BFS/BFS_BENCH.TU.lumi_gfx90a new file mode 100644 index 0000000000..1b6a3f5859 --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.lumi_gfx90a @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 18-05-2026 -- 08:29:33 +OS: nid007956__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +CPU model : AMD EPYC 7A53 64-Core Processor +Total number of threads:128 +GPU model: AMD Instinct MI250X +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 70.5197 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 1.27103 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 6.05094 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.672327 +Standard deviation between time steps: 0.171609 +Time elapsed in the skipped time steps: 0.825163 + + +Standard counter 
description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0661606 | 9.8 | 6 +Convection operator | 0.01064927 | 1.6 | 6 +Diffusion operator | 0.01735671 | 2.6 | 6 +Gradient operator | 0.004805947 | 0.7 | 12 +Divergence operator | 0.005487979 | 0.8 | 8 +Source terms | 0.000698267 | 0.1 | 3 +Update ::mettre_a_jour | 0.1170135 | 17.4 | 2 +Computation of the time step dt | 0.0009277774 | 0.1 | 4 +Turbulence model::update | 0.01760316 | 2.6 | 2 +Post-treatment operations | 0.4398524 | 65.4 | 2 +Other operations | -0.008228412 | -1.2 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0653104 | 9.7 | 6 | +Kernels: | 0.537028 | 79.9 | 1328 | +Copy host to device: | 0.00251406 | 0.4 | 92 | 3.2 GB/s +Copy device to host: | 0.00320432 | 0.5 | 35 | 16.7 GB/s +Alloc/Free on device: | 0.00185866 | 0.3 | 614 | +GPU: 90% Copy H<->D: 0.85% Alloc/free: 0.28% Comm: 0% CPU & I/O: 9.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.66404 + +Total time for the whole computation 79.0599 + +[Slurm] Power consumption (106 s): 0.471 kW 0.014 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/BFS/BFS_BENCH.TU.topaze_cc80 b/tests/GPU/BFS/BFS_BENCH.TU.topaze_cc80 new file mode 
100644 index 0000000000..db25e32d24 --- /dev/null +++ b/tests/GPU/BFS/BFS_BENCH.TU.topaze_cc80 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the BFS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 15-05-2026 -- 13:20:33 +OS: topaze7064__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : AMD EPYC 7763 64-Core Processor +Total number of threads:256 +GPU model: NVIDIA A100-SXM4-80GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 22.475 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.868012 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 
1.82319 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.202576 +Standard deviation between time steps: 0.0779406 +Time elapsed in the skipped time steps: 0.885518 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0650238 | 32.1 | 6 +Convection operator | 0.006038481 | 3.0 | 6 +Diffusion operator | 0.008644277 | 4.3 | 6 +Gradient operator | 0.003602949 | 1.8 | 12 +Divergence operator | 0.00453026 | 2.2 | 8 +Source terms | 0.000635081 | 0.3 | 3 +Update ::mettre_a_jour | 0.03531282 | 17.4 | 2 +Computation of the time step dt | 0.0006298844 | 0.3 | 4 +Turbulence model::update | 0.003786783 | 1.9 | 2 +Post-treatment operations | 0.07020698 | 34.7 | 2 +Other operations | 0.004165048 | 2.1 | + +Average number of iteration of the linear solver per call: 11.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0639933 | 31.6 | 6 | +Kernels: | 0.0611641 | 30.2 | 1328 | +Copy host to device: | 0.00190596 | 0.9 | 92 | 4.2 GB/s +Copy device to host: | 0.00386666 | 1.9 | 35 | 13.9 GB/s +Alloc/Free on device: | 0.00295996 | 1.5 | 614 | +GPU: 62% Copy H<->D: 2.8% Alloc/free: 1.5% Comm: 0% CPU & I/O: 34% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.33353 + +Total time 
for the whole computation 26.5173 + +[Slurm] Power consumption (66 s): 0.545 kW 0.010 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/BFS/BFS_K_DOM.son.ref b/tests/GPU/BFS/BFS_K_DOM.son.ref new file mode 100644 index 0000000000..f318db270d --- /dev/null +++ b/tests/GPU/BFS/BFS_K_DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_K_DOM.son +# Temps x= 7.50000000e+00 y= 0.00000000e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.22222222e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 4.44444444e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 6.66666667e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 8.88888889e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.11111111e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.33333333e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.55555556e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.77777778e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.00000000e+00 z= 5.00000000e+00 +# Champ K [m2/s2] +# Type SEGMENT 7.500000 0.000000 5.000000 7.500000 2.000000 5.000000 +0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.86475840e-01 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_MOY_TAUX_CIS_DOM.son.ref b/tests/GPU/BFS/BFS_MOY_TAUX_CIS_DOM.son.ref new file mode 100644 index 0000000000..ff32e84b81 --- /dev/null +++ b/tests/GPU/BFS/BFS_MOY_TAUX_CIS_DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_MOY_TAUX_CIS_DOM.son +# Temps x= 1.10000000e+00 y= 1.00000000e-01 z= 1.00000000e-01 +# Champ MOY_TAUX_CIS [s-1] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 0.00000000e+00 +4.77545847e-02 3.14010496e+01 +9.55010851e-02 3.14003238e+01 +1.43253236e-01 3.13999909e+01 +1.91000668e-01 3.14006385e+01 +2.38742902e-01 3.14009290e+01 +2.86475840e-01 3.14032656e+01 +3.34185730e-01 3.14032909e+01 +3.81859477e-01 3.14079857e+01 +4.29444051e-01 3.14065507e+01 +4.76994676e-01 3.14143110e+01 diff --git a/tests/GPU/BFS/BFS_MOY_TAUX_CIS_WALL_DOM.son.ref b/tests/GPU/BFS/BFS_MOY_TAUX_CIS_WALL_DOM.son.ref new file mode 100644 index 0000000000..06bb4ac1ce --- /dev/null +++ b/tests/GPU/BFS/BFS_MOY_TAUX_CIS_WALL_DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_MOY_TAUX_CIS_WALL_DOM.son +# Temps x= 1.10000000e+00 y= 0.00000000e+00 z= 1.00000000e-01 +# Champ MOY_TAUX_CIS_WALL [s-1] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 0.00000000e+00 +4.77545847e-02 5.02399803e+01 
+9.55010851e-02 5.02370293e+01 +1.43253236e-01 5.02342198e+01 +1.91000668e-01 5.02319335e+01 +2.38742902e-01 5.02303891e+01 +2.86475840e-01 5.02291756e+01 +3.34185730e-01 5.02292593e+01 +3.81859477e-01 5.02286506e+01 +4.29444051e-01 5.02306561e+01 +4.76994676e-01 5.02297405e+01 diff --git a/tests/GPU/BFS/BFS_MOY_UTAU_DOM.son.ref b/tests/GPU/BFS/BFS_MOY_UTAU_DOM.son.ref new file mode 100644 index 0000000000..0e1da14445 --- /dev/null +++ b/tests/GPU/BFS/BFS_MOY_UTAU_DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_MOY_UTAU_DOM.son +# Temps x= 1.10000000e+00 y= 1.00000000e-01 z= 1.00000000e-01 +# Champ MOY_UTAU_ [??] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 +9.55010851e-02 0.00000000e+00 +1.43253236e-01 0.00000000e+00 +1.91000668e-01 0.00000000e+00 +2.38742902e-01 0.00000000e+00 +2.86475840e-01 0.00000000e+00 +3.34185730e-01 0.00000000e+00 +3.81859477e-01 0.00000000e+00 +4.29444051e-01 0.00000000e+00 +4.76994676e-01 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_MOY_VITESSE_DOM.son.ref b/tests/GPU/BFS/BFS_MOY_VITESSE_DOM.son.ref new file mode 100644 index 0000000000..cac261ddc0 --- /dev/null +++ b/tests/GPU/BFS/BFS_MOY_VITESSE_DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_MOY_VITESSE_DOM.son +# Temps x= 1.00000000e+00 y= 0.00000000e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 2.22222222e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 4.44444444e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 6.66666667e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 8.88888889e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.11111111e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.33333333e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.55555556e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.77777778e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 2.00000000e+00 z= 0.00000000e+00 +# Champ MOY_VITESSE [m/s] +# Type SEGMENT 1.000000 0.000000 0.000000 1.000000 2.000000 0.000000 +0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.77545847e-02 1.25600000e+01 0.00000000e+00 1.86893863e-10 1.25600000e+01 2.49272462e-04 2.25726378e-10 1.25600000e+01 3.26202885e-05 2.26774129e-10 1.25600000e+01 -1.30174838e-04 1.52115499e-10 1.25600000e+01 -1.51717562e-04 1.94261376e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +9.55010851e-02 1.25578821e+01 0.00000000e+00 -6.74814495e-11 1.25599998e+01 -5.00094655e-04 -9.33348727e-11 1.25600000e+01 -3.70617170e-05 2.44534252e-10 1.25599998e+01 2.29229623e-04 1.09188400e-10 1.25578821e+01 3.07926204e-04 -2.09276414e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.43253236e-01 1.25558778e+01 0.00000000e+00 -2.33903184e-10 1.25601124e+01 -1.08661111e-03 -4.33076349e-10 1.25601130e+01 -1.34745899e-04 9.29268864e-11 1.25601124e+01 5.59067494e-04 4.05838848e-12 1.25558778e+01 6.62296352e-04 -4.03340234e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.91000668e-01 1.25540127e+01 0.00000000e+00 -2.54717524e-10 1.25603630e+01 -1.11814645e-03 -4.76759542e-10 1.25603643e+01 -1.18765710e-04 2.36734674e-10 1.25603630e+01 5.52917218e-04 3.25693525e-10 1.25540127e+01 6.83996235e-04 
-8.06715768e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.38742902e-01 1.25523012e+01 0.00000000e+00 -3.25918676e-10 1.25607663e+01 -1.41217546e-03 -8.40371750e-10 1.25607685e+01 -1.76303606e-04 -2.36876881e-10 1.25607663e+01 7.27905812e-04 6.96667544e-11 1.25523012e+01 8.60593826e-04 -9.60868743e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.86475840e-01 1.25507473e+01 0.00000000e+00 -2.90044977e-11 1.25613259e+01 -1.39385504e-03 -1.95285024e-10 1.25613292e+01 -1.70363051e-04 6.15474726e-10 1.25613259e+01 7.14357395e-04 1.11326854e-09 1.25507473e+01 8.49866465e-04 -1.23527436e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.34185730e-01 1.25493451e+01 0.00000000e+00 -8.07691414e-10 1.25620356e+01 -1.45720521e-03 -1.68645372e-09 1.25620402e+01 -1.55088937e-04 -1.26564152e-09 1.25620356e+01 7.20918718e-04 -7.41614738e-10 1.25493451e+01 8.91452479e-04 -1.50045991e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.81859477e-01 1.25480804e+01 0.00000000e+00 1.17994467e-09 1.25628805e+01 -1.86490361e-03 1.64173303e-09 1.25628868e+01 -2.90941960e-04 2.62554731e-09 1.25628805e+01 1.02667324e-03 3.58882467e-09 1.25480804e+01 1.12919124e-03 -7.34502178e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.29444051e-01 1.25469333e+01 0.00000000e+00 -3.43417239e-09 1.25638388e+01 -9.58508785e-04 -5.50618813e-09 1.25638468e+01 7.74674626e-06 -5.16764276e-09 1.25638388e+01 3.50728206e-04 -5.32881603e-09 1.25469333e+01 6.00250616e-04 -3.57486443e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.76994676e-01 1.25458771e+01 0.00000000e+00 7.27711023e-09 1.25648858e+01 -3.08862823e-03 9.80741271e-09 1.25648959e+01 -6.15540112e-04 1.04150364e-08 1.25648858e+01 1.85084787e-03 1.36067776e-08 1.25458771e+01 1.85341917e-03 4.08437442e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_MOY_YPLUS__DOM.son.ref b/tests/GPU/BFS/BFS_MOY_YPLUS__DOM.son.ref new file mode 100644 index 0000000000..36fee62da6 --- /dev/null +++ b/tests/GPU/BFS/BFS_MOY_YPLUS__DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_MOY_YPLUS__DOM.son +# Temps x= 1.10000000e+00 y= 0.00000000e+00 z= 1.00000000e-01 +# Champ MOY_YPLUS_ [adimensionnel] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 0.00000000e+00 +4.77545847e-02 9.42798792e+01 +9.55010851e-02 9.42755547e+01 +1.43253236e-01 9.42701369e+01 +1.91000668e-01 9.42645492e+01 +2.38742902e-01 9.42588960e+01 +2.86475840e-01 9.42535715e+01 +3.34185730e-01 9.42490186e+01 +3.81859477e-01 9.42439293e+01 +4.29444051e-01 9.42412777e+01 +4.76994676e-01 9.42350202e+01 diff --git a/tests/GPU/BFS/BFS_NUT_MAX_BOX.son.ref b/tests/GPU/BFS/BFS_NUT_MAX_BOX.son.ref new file mode 100644 index 0000000000..d4ffda5873 --- /dev/null +++ b/tests/GPU/BFS/BFS_NUT_MAX_BOX.son.ref @@ 
-0,0 +1,15 @@ +# BFS_NUT_MAX_BOX.son +# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01 +# Champ NUT_MAX [m2/s] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 +9.55010851e-02 0.00000000e+00 +1.43253236e-01 0.00000000e+00 +1.91000668e-01 0.00000000e+00 +2.38742902e-01 0.00000000e+00 +2.86475840e-01 0.00000000e+00 +3.34185730e-01 0.00000000e+00 +3.81859477e-01 0.00000000e+00 +4.29444051e-01 0.00000000e+00 +4.76994676e-01 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_PRESSION_DOM.son.ref b/tests/GPU/BFS/BFS_PRESSION_DOM.son.ref new file mode 100644 index 0000000000..9d5fd3115e --- /dev/null +++ b/tests/GPU/BFS/BFS_PRESSION_DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_PRESSION_DOM.son +# Temps x= 7.50000000e+00 y= 0.00000000e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.22222222e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 4.44444444e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 6.66666667e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 8.88888889e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.11111111e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.33333333e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.55555556e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.77777778e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.00000000e+00 z= 5.00000000e+00 +# Champ PRESSION [Pa.m3/kg] +# Type SEGMENT 7.500000 0.000000 5.000000 7.500000 2.000000 5.000000 +0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_TAUX_CIS_0_BOX.son.ref b/tests/GPU/BFS/BFS_TAUX_CIS_0_BOX.son.ref new file mode 100644 index 0000000000..47d276472b --- /dev/null +++ b/tests/GPU/BFS/BFS_TAUX_CIS_0_BOX.son.ref @@ -0,0 +1,15 @@ +# BFS_TAUX_CIS_0_BOX.son +# Temps x= -9.20000000e+00 y= 1.50000000e+00 z= 3.14000000e+00 +# Champ TAUX_CISAILLEMENT [s-1] +# Type POINTS +0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 +9.55010851e-02 0.00000000e+00 +1.43253236e-01 0.00000000e+00 +1.91000668e-01 0.00000000e+00 +2.38742902e-01 0.00000000e+00 +2.86475840e-01 0.00000000e+00 +3.34185730e-01 0.00000000e+00 +3.81859477e-01 0.00000000e+00 +4.29444051e-01 0.00000000e+00 +4.76994676e-01 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_TAUX_CIS_ELEM0_BOX.son.ref 
b/tests/GPU/BFS/BFS_TAUX_CIS_ELEM0_BOX.son.ref new file mode 100644 index 0000000000..2383069f4f --- /dev/null +++ b/tests/GPU/BFS/BFS_TAUX_CIS_ELEM0_BOX.son.ref @@ -0,0 +1,15 @@ +# BFS_TAUX_CIS_ELEM0_BOX.son +# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01 +# Champ TAUX_CISAILLEMENT [s-1] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 6.28000000e+01 +4.77545847e-02 6.27894087e+01 +9.55010851e-02 6.27805172e+01 +1.43253236e-01 6.27738299e+01 +1.91000668e-01 6.27695848e+01 +2.38742902e-01 6.27677501e+01 +2.86475840e-01 6.27680561e+01 +3.34185730e-01 6.27700461e+01 +3.81859477e-01 6.27731398e+01 +4.29444051e-01 6.27767021e+01 +4.76994676e-01 6.27801229e+01 diff --git a/tests/GPU/BFS/BFS_TAUX_CIS_WALL_BOX.son.ref b/tests/GPU/BFS/BFS_TAUX_CIS_WALL_BOX.son.ref new file mode 100644 index 0000000000..fe62cab1ed --- /dev/null +++ b/tests/GPU/BFS/BFS_TAUX_CIS_WALL_BOX.son.ref @@ -0,0 +1,15 @@ +# BFS_TAUX_CIS_WALL_BOX.son +# Temps x= 1.00000000e-01 y= 0.00000000e+00 z= 1.00000000e-01 +# Champ TAUX_CIS_WALL [s-1] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 6.28000000e+01 +4.77545847e-02 6.27894087e+01 +9.55010851e-02 6.27805172e+01 +1.43253236e-01 6.27738299e+01 +1.91000668e-01 6.27695848e+01 +2.38742902e-01 6.27677501e+01 +2.86475840e-01 6.27680561e+01 +3.34185730e-01 6.27700461e+01 +3.81859477e-01 6.27731398e+01 +4.29444051e-01 6.27767021e+01 +4.76994676e-01 6.27801229e+01 diff --git a/tests/GPU/BFS/BFS_UPRIME_DOM.son.ref b/tests/GPU/BFS/BFS_UPRIME_DOM.son.ref new file mode 100644 index 0000000000..2c34efdee5 --- /dev/null +++ b/tests/GPU/BFS/BFS_UPRIME_DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_UPRIME_DOM.son +# Temps x= 1.00000000e+00 y= 0.00000000e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 2.22222222e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 4.44444444e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 6.66666667e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 8.88888889e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.11111111e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 
1.33333333e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.55555556e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.77777778e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 2.00000000e+00 z= 0.00000000e+00 +# Champ UPRIME [??] +# Type SEGMENT 1.000000 0.000000 0.000000 1.000000 2.000000 0.000000 +0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.77635684e-15 0.00000000e+00 0.00000000e+00 1.77635684e-15 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +9.55010851e-02 -1.70451372e-03 -4.13749826e-04 -2.05368209e-10 -3.51204031e-04 -4.76474206e-04 -1.92154026e-10 -1.25448652e-04 -1.10328445e-10 7.95685926e-13 -3.51204035e-04 4.76473702e-04 -1.43875005e-11 -1.70451396e-03 4.13749433e-04 -1.49678097e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.43253236e-01 -3.37059554e-03 -6.37830869e-04 -5.20499010e-10 -2.36895221e-04 -8.13654229e-04 -3.28825657e-10 -1.25695954e-04 -5.60994423e-09 -1.89704729e-10 -2.36915066e-04 8.13662528e-04 -2.00995072e-10 -3.37058137e-03 6.37844777e-04 -2.63889988e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.91000668e-01 -5.53720379e-03 -5.86131934e-05 -8.92472976e-11 6.50296474e-04 -1.54672586e-05 2.45320995e-11 8.40300507e-04 9.67721645e-09 1.60411020e-10 6.50324971e-04 1.54579017e-05 5.40508648e-10 -5.53722287e-03 5.85941593e-05 -5.30380284e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.38742902e-01 -6.21119627e-03 -6.35780484e-04 -8.29535899e-10 1.18474075e-03 -8.42938087e-04 -6.49151322e-10 1.20264813e-03 -3.30813897e-08 -1.07479952e-09 1.18463084e-03 8.42981478e-04 -5.64928920e-10 -6.21111987e-03 6.35856955e-04 -5.29458841e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.86475840e-01 -7.82040184e-03 4.82026519e-05 1.25767714e-09 2.82047684e-03 7.49260474e-05 1.75858909e-09 2.85804539e-03 4.46541338e-08 2.15503276e-09 2.82060851e-03 -7.49676853e-05 2.62876573e-09 -7.82048881e-03 -4.82889439e-05 -5.18621113e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.34185730e-01 -8.19520068e-03 -2.24296228e-04 -3.79980139e-09 3.95431085e-03 -1.41674697e-04 -4.81063324e-09 4.43486449e-03 -1.97341878e-07 -6.21367146e-09 3.95367378e-03 1.41915440e-04 -5.60304701e-09 -8.19476162e-03 2.24734313e-04 -1.26783565e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.81859477e-01 -7.36501478e-03 -1.50041972e-03 8.78165311e-09 5.27973068e-03 -2.35762953e-03 1.28814437e-08 4.21941288e-03 3.29118509e-07 1.53989989e-08 5.28062267e-03 2.35740031e-03 1.55525360e-08 -7.36557577e-03 1.49986138e-03 3.22715690e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.29444051e-01 -1.30274103e-02 3.82176250e-03 -2.17536301e-08 9.35376067e-03 5.97926885e-03 -3.23084243e-08 1.20198925e-02 -6.62978936e-07 -3.65563323e-08 9.35153267e-03 -5.97837566e-03 -3.74076058e-08 -1.30258490e-02 -3.82020633e-03 -1.22488296e-08 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.76994676e-01 6.49716025e-04 -1.01878961e-02 5.36417951e-08 4.33501496e-03 -1.52546095e-02 7.96838950e-08 -6.59891810e-04 1.32714207e-06 8.54376745e-08 4.33828965e-03 1.52540096e-02 9.24238394e-08 6.47779337e-04 1.01859691e-02 3.61955081e-08 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_USTAR_BOX.son.ref b/tests/GPU/BFS/BFS_USTAR_BOX.son.ref new file mode 100644 index 0000000000..d55d0b146c --- /dev/null +++ b/tests/GPU/BFS/BFS_USTAR_BOX.son.ref @@ -0,0 +1,15 @@ +# BFS_USTAR_BOX.son +# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01 +# Champ U_STAR [m2/s2] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 +9.55010851e-02 0.00000000e+00 +1.43253236e-01 0.00000000e+00 +1.91000668e-01 0.00000000e+00 
+2.38742902e-01 0.00000000e+00 +2.86475840e-01 0.00000000e+00 +3.34185730e-01 0.00000000e+00 +3.81859477e-01 0.00000000e+00 +4.29444051e-01 0.00000000e+00 +4.76994676e-01 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_UTAU_BOX.son.ref b/tests/GPU/BFS/BFS_UTAU_BOX.son.ref new file mode 100644 index 0000000000..63d1a953b1 --- /dev/null +++ b/tests/GPU/BFS/BFS_UTAU_BOX.son.ref @@ -0,0 +1,15 @@ +# BFS_UTAU_BOX.son +# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01 +# Champ UTAU [??] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 +9.55010851e-02 0.00000000e+00 +1.43253236e-01 0.00000000e+00 +1.91000668e-01 0.00000000e+00 +2.38742902e-01 0.00000000e+00 +2.86475840e-01 0.00000000e+00 +3.34185730e-01 0.00000000e+00 +3.81859477e-01 0.00000000e+00 +4.29444051e-01 0.00000000e+00 +4.76994676e-01 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_VISC_DOM.son.ref b/tests/GPU/BFS/BFS_VISC_DOM.son.ref new file mode 100644 index 0000000000..b5175fac37 --- /dev/null +++ b/tests/GPU/BFS/BFS_VISC_DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_VISC_DOM.son +# Temps x= 7.50000000e+00 y= 0.00000000e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.22222222e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 4.44444444e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 6.66666667e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 8.88888889e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.11111111e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.33333333e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.55555556e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.77777778e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.00000000e+00 z= 5.00000000e+00 +# Champ VISCOSITE_TURBULENTE [m2/s] +# Type SEGMENT 7.500000 0.000000 5.000000 7.500000 2.000000 5.000000 +0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_VITESSE_BOX.son.ref b/tests/GPU/BFS/BFS_VITESSE_BOX.son.ref new file mode 100644 index 0000000000..b43caa59fd --- /dev/null +++ b/tests/GPU/BFS/BFS_VITESSE_BOX.son.ref @@ -0,0 +1,15 @@ +# BFS_VITESSE_BOX.son +# Temps x= -9.20000000e+00 y= 1.00000000e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.11111111e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.22222222e+00 z= 0.00000000e+00 x= 
-9.20000000e+00 y= 1.33333333e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.44444444e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.55555556e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.66666667e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.77777778e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.88888889e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 2.00000000e+00 z= 0.00000000e+00 +# Champ VITESSE [m/s] +# Type SEGMENT -9.200000 1.000000 0.000000 -9.200000 2.000000 0.000000 +0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_VITESSE_DOM.son.ref b/tests/GPU/BFS/BFS_VITESSE_DOM.son.ref new file mode 100644 index 0000000000..2461a73758 --- /dev/null +++ b/tests/GPU/BFS/BFS_VITESSE_DOM.son.ref @@ -0,0 +1,15 @@ +# BFS_VITESSE_DOM.son +# Temps x= 7.50000000e+00 y= 0.00000000e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.22222222e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 4.44444444e-01 
z= 5.00000000e+00 x= 7.50000000e+00 y= 6.66666667e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 8.88888889e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.11111111e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.33333333e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.55555556e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.77777778e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.00000000e+00 z= 5.00000000e+00 +# Champ VITESSE [m/s] +# Type SEGMENT 7.500000 0.000000 5.000000 7.500000 2.000000 5.000000 +0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_VITESSE_DOM_PT_DNS.son.ref b/tests/GPU/BFS/BFS_VITESSE_DOM_PT_DNS.son.ref new file mode 100644 index 0000000000..69c9bbeed6 --- /dev/null +++ b/tests/GPU/BFS/BFS_VITESSE_DOM_PT_DNS.son.ref @@ -0,0 +1,15 @@ +# BFS_VITESSE_DOM_PT_DNS.son +# Temps x= 1.60000000e+00 y= 1.00000000e+00 z= 3.14000000e+00 x= 1.60000000e+00 y= 4.84000000e-01 z= 
3.14000000e+00 x= 9.20000000e+00 y= 4.84000000e-01 z= 3.14000000e+00 x= 1.68000000e+01 y= 1.00000000e+00 z= 3.14000000e+00 +# Champ VITESSE [m/s] +# Type POINTS +0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 +4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_VITESSE_MAX_BOX.son.ref b/tests/GPU/BFS/BFS_VITESSE_MAX_BOX.son.ref new file mode 100644 index 0000000000..0ef4d6f1b3 --- /dev/null +++ b/tests/GPU/BFS/BFS_VITESSE_MAX_BOX.son.ref @@ -0,0 +1,15 @@ +# BFS_VITESSE_MAX_BOX.son +# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01 +# Champ VITESSE_MAX [m/s] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 +4.77545847e-02 1.25600000e+01 0.00000000e+00 0.00000000e+00 +9.55010851e-02 1.25603389e+01 0.00000000e+00 0.00000000e+00 +1.43253236e-01 1.25611183e+01 0.00000000e+00 0.00000000e+00 +1.91000668e-01 1.25623854e+01 0.00000000e+00 0.00000000e+00 +2.38742902e-01 1.25641338e+01 0.00000000e+00 0.00000000e+00 +2.86475840e-01 1.25663097e+01 0.00000000e+00 0.00000000e+00 +3.34185730e-01 1.25688209e+01 0.00000000e+00 0.00000000e+00 +3.81859477e-01 1.25715511e+01 0.00000000e+00 0.00000000e+00 +4.29444051e-01 1.25743707e+01 0.00000000e+00 0.00000000e+00 +4.76994676e-01 1.25771604e+01 0.00000000e+00 0.00000000e+00 diff --git a/tests/GPU/BFS/BFS_YPLUS_BOX.son.ref b/tests/GPU/BFS/BFS_YPLUS_BOX.son.ref new file mode 100644 index 0000000000..5e96303584 --- /dev/null +++ b/tests/GPU/BFS/BFS_YPLUS_BOX.son.ref @@ -0,0 +1,15 @@ +# BFS_YPLUS_BOX.son +# Temps x= 1.00000000e-01 y= 0.00000000e+00 z= 1.00000000e-01 +# Champ YPLUS [adimensionnel] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 9.42845742e+01 +4.77545847e-02 9.42686730e+01 +9.55010851e-02 9.42540521e+01 +1.43253236e-01 9.42410884e+01 +1.91000668e-01 9.42299628e+01 +2.38742902e-01 9.42206518e+01 +2.86475840e-01 9.42129538e+01 +3.34185730e-01 9.42065280e+01 +3.81859477e-01 
9.42009407e+01 +4.29444051e-01 9.41957235e+01 +4.76994676e-01 9.41904089e+01 diff --git a/tests/GPU/BFS/BFS_YPLUS_MAX_BOX.son.ref b/tests/GPU/BFS/BFS_YPLUS_MAX_BOX.son.ref new file mode 100644 index 0000000000..8f61c144fa --- /dev/null +++ b/tests/GPU/BFS/BFS_YPLUS_MAX_BOX.son.ref @@ -0,0 +1,15 @@ +# BFS_YPLUS_MAX_BOX.son +# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01 +# Champ YPLUS_MAX [adimensionnel] +# Type NUMERO_ELEM_SUR_MAITRE +0.00000000e+00 9.42845742e+01 +4.77545847e-02 9.42686730e+01 +9.55010851e-02 9.42540521e+01 +1.43253236e-01 9.42410884e+01 +1.91000668e-01 9.42299628e+01 +2.38742902e-01 9.42206518e+01 +2.86475840e-01 9.42129538e+01 +3.34185730e-01 9.42065280e+01 +3.81859477e-01 9.42009407e+01 +4.29444051e-01 9.41957235e+01 +4.76994676e-01 9.41904089e+01 diff --git a/tests/GPU/BFS/check_perf.sh b/tests/GPU/BFS/check_perf.sh new file mode 120000 index 0000000000..6d20411c12 --- /dev/null +++ b/tests/GPU/BFS/check_perf.sh @@ -0,0 +1 @@ +../DomainFlowLES/check_perf.sh \ No newline at end of file diff --git a/tests/GPU/BFS/verifie b/tests/GPU/BFS/verifie new file mode 100755 index 0000000000..f989b60130 --- /dev/null +++ b/tests/GPU/BFS/verifie @@ -0,0 +1,17 @@ +message() +{ + [ $1 != $2 ] && echo $ECHO_OPTS "Error ($1!=$2) when checking:\n $msg" && err=1 + #echo $msg +} + +##################################### +# Comparaison non regression des .son (reduction) +##################################### +err=0 +for file in `ls *.son.ref 2>/dev/null` +do + msg="compare_sonde $file ${file%.ref}" + eval $msg 1>verifie.log 2>&1 + message $? 
0 +done +exit $err diff --git a/tests/GPU/Canal_VDF/Canal_VDF.data b/tests/GPU/Canal_VDF/Canal_VDF.data new file mode 100644 index 0000000000..fa40868626 --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF.data @@ -0,0 +1,193 @@ +# LES with periodic box # +# PARALLEL OK # +Dimension 3 + +Pb_hydraulique_turbulent pb + +Domaine dom_perio + +# BEGIN MESH # +Mailler dom_perio +{ + Pave pave + { + /* warning dumb geometry */ + Origine -30 0. 0. + Nombre_de_Noeuds 6 6 6 + /* Nombre_de_Noeuds 101 101 101 */ + Longueurs 30 2 10 + } + { + Bord Periox X = -30 0. <= Y <= 2. 0. <= Z <= 10. + Bord Periox X = 0 0. <= Y <= 2. 0. <= Z <= 10. + Bord LowerWall Y = 0. -30. <= X <= 0. 0. <= Z <= 10. + Bord UpperWall Y = 2. -30. <= X <= 0. 0. <= Z <= 10. + Bord Perioz Z = 0. -30. <= X <= 0. 0. <= Y <= 2. + Bord Perioz Z = 10. -30. <= X <= 0. 0. <= Y <= 2. + } +} +Declarer_bord_perio { domaine dom_perio bord Periox } +Declarer_bord_perio { domaine dom_perio bord Perioz } +# END MESH # + +# BEGIN PARTITION +Partition dom_perio +{ + Partition_tool Metis { Nb_parts 4 } + Larg_joint 2 + zones_name DOM +} +End +END PARTITION # + +# BEGIN SCATTER +Scatter DOM.Zones dom_perio +END SCATTER # + +VDF dis +Lire dis { reorder { algo hilbert } } + +Runge_Kutta_ordre_3 sch +Lire sch +{ + nb_pas_dt_max 10 + tinit 0 + dt_impr 1e-6 + facsec 2 + precision_impr 8 + tcpumax 23 +} + +Associer pb dom_perio +Associer pb sch + +Discretiser pb dis + +Lire pb +{ + Fluide_incompressible + { + mu champ_uniforme 1 3.5e-04 + rho champ_uniforme 1 1 + } + Navier_Stokes_turbulent + { + Solveur_pression AMG GCP { rtol 1e-15 impr } +# + solveur_pression petsc cli + { + -ksp_view + -ksp_type gmres + -ksp_norm_type unpreconditioned + -pc_type hypre + -pc_hypre_type boomeramg + -pc_mg_galerkin_mat_product_algorithm hypre + -pc_hypre_boomeramg_relax_type_all l1scaled-Jacobi + -pc_hypre_boomeramg_coarsen_type pmis + -pc_hypre_boomeramg_interp_type ext+i + -pc_hypre_boomeramg_strong_threshold 0.30 + 
-pc_hypre_boomeramg_print_statistics 1 + -ksp_rtol 1e-15 impr + } +# + conditions_initiales { + vitesse champ_uniforme 3 1 0 0 + pression champ_uniforme 1 0 + } + conditions_limites { + Periox periodique + Perioz periodique + LowerWall paroi_fixe + UpperWall paroi_fixe + } + convection { centre4 } + diffusion { } + sources { canal_perio { bord Periox } } + modele_turbulence null { } + } + Postraitement + { + definition_champs { + # champs instantanes # + p refChamp { pb_champ pb pression_pa } + ui refChamp { pb_champ pb vitesse } + duidxj refChamp { pb_champ pb gradient_vitesse } + + # statistiques # + moy_p moyenne { t_deb 0 t_fin 1e+6 sources_reference { p } } + ec_p ecart_type { t_deb 0 t_fin 1e+6 sources_reference { p } } + moy_ui moyenne { t_deb 0 t_fin 1e+6 sources_reference { ui } } + ec_ui ecart_type { t_deb 0 t_fin 1e+6 sources_reference { ui } } + moy_duidxj moyenne { t_deb 0 t_fin 1e+6 sources_reference { duidxj } } + ec_duidxj ecart_type { t_deb 0 t_fin 1e+6 sources_reference { duidxj } } + pui correlation { t_deb 0 t_fin 1e+6 sources_reference { p , ui } } + pduidxj correlation { t_deb 0 t_fin 1e+6 sources_reference { p , duidxj } } # vecteur 9 composantes : composante (i,j) -> colonne 3*i+j-4 # + uiuj correlation { t_deb 0 t_fin 1e+6 sources_reference { ui , ui } } # vecteur 9 composantes : composante (i,j) -> colonne 3*i+j-4 # + duidxj_dukdxl correlation { t_deb 0 t_fin 1e+6 sources_reference { duidxj , duidxj } } # vecteur 81 composantes : composante (i,j,k,l) -> colonne 27*i+9*j+3*k+l-40 # + uiujuk correlation_triple { t_deb 0 t_fin 1e+6 sources_reference { ui , ui , ui } } # vecteur 27 composantes : composante (i,j,k) -> colonne 9*i+3*j+k-13 # + + # pour snapshots # + U transformation { methode composante numero 0 localisation elem sources_reference { ui } } + V transformation { methode composante numero 1 localisation elem sources_reference { ui } } + W transformation { methode composante numero 2 localisation elem sources_reference { ui } } + UU 
transformation { methode formule expression 1 U*U localisation elem sources_reference { U } } + VV transformation { methode formule expression 1 V*V localisation elem sources_reference { V } } + WW transformation { methode formule expression 1 W*W localisation elem sources_reference { W } } + UV transformation { methode formule expression 1 U*V localisation elem sources_reference { U , V } } + UW transformation { methode formule expression 1 U*W localisation elem sources_reference { U , W } } + VW transformation { methode formule expression 1 V*W localisation elem sources_reference { V , W } } + } + sondes { + # pour controle convergence # + moy_p_streamwise moy_p periode 1e-6 segment 10 -25.13 1 0 0 1 0 + moy_p_spanwise moy_p periode 1e-6 segment 10 0 1 0 0 1 9.42 + moy_p_normal moy_p periode 1e-6 segment 10 0 0 0 0 2 0 + + moy_ui_streamwise moy_ui periode 1e-6 position_like moy_p_streamwise + moy_ui_spanwise moy_ui periode 1e-6 position_like moy_p_spanwise + moy_ui_normal moy_ui periode 1e-6 position_like moy_p_normal + + uiuj_streamwise uiuj periode 1e-6 position_like moy_p_streamwise + uiuj_spanwise uiuj periode 1e-6 position_like moy_p_spanwise + uiuj_normal uiuj periode 1e-6 position_like moy_p_normal + + ec_duidxj_streamwise ec_duidxj periode 1e-6 position_like moy_p_streamwise + ec_duidxj_spanwise ec_duidxj periode 1e-6 position_like moy_p_spanwise + ec_duidxj_normal ec_duidxj periode 1e-6 position_like moy_p_normal + + uiujuk_streamwise uiujuk periode 1e-6 position_like moy_p_streamwise + uiujuk_spanwise uiujuk periode 1e-6 position_like moy_p_spanwise + uiujuk_normal uiujuk periode 1e-6 position_like moy_p_normal + + # pour autocorrelations spatiales et temporelles # + p_streamwise grav p periode 1e-6 segment 10000 -25.1327 1 0 0 1 0 + p_spanwise grav p periode 1e-6 segment 10000 0 1 0 0 1 9.42478 + p_normal grav p periode 1e-6 segment 10000 0 0 0 0 2 0 + + ui_streamwise grav ui periode 1e-6 position_like p_streamwise + ui_spanwise grav ui periode 1e-6 
position_like p_spanwise + ui_normal grav ui periode 1e-6 position_like p_normal + } + format lml + champs dt_post 1e6 + { + p elem + U elem + V elem + W elem + UU elem + VV elem + WW elem + UV elem + UW elem + VW elem + } + } + Sauvegarde_simple pdi Cas.sauv +} + +EcritureLectureSpecial 0 + +Resoudre pb + +Fin diff --git a/tests/GPU/Canal_VDF/Canal_VDF.lml.gz b/tests/GPU/Canal_VDF/Canal_VDF.lml.gz new file mode 100644 index 0000000000..e9a97eefdf Binary files /dev/null and b/tests/GPU/Canal_VDF/Canal_VDF.lml.gz differ diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx90a b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx90a new file mode 100644 index 0000000000..68df863854 --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx90a @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-05-2026 -- 15:52:42 +OS: g1301__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD EPYC 7A53 64-Core Processor +Total number of threads:128 +GPU model: AMD Instinct MI250X +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 17.2369 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.751213 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.44631 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.271812 +Standard deviation between time steps: 0.103236 +Time elapsed in the skipped time steps: 2.40684 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00308358 | 1.1 | 3 +Convection operator | 0.01080208 | 4.0 | 3 +Diffusion operator | 0.007258781 | 2.7 | 3 +Gradient operator | 0.001891219 | 0.7 | 6 +Divergence operator | 0.001879412 | 0.7 | 4 +Source terms | 0.0007146429 | 0.3 | 3 +Update ::mettre_a_jour | 0.03159206 | 11.6 | 1 +Computation of the time step dt | 0.0004381009 | 0.2 | 2 +Post-treatment operations | 0.2100512 | 77.3 | 1 +Other operations | 0.004101117 | 1.5 | + +Average number of iteration of the linear solver per call: 0.37 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00271389 | 1.0 | 3 | +Kernels: | 0.254017 | 93.5 | 451 | +Copy host to device: | 0.000601216 | 0.2 | 33 | 0.0 GB/s +Copy device to host: | 0.000993021 | 0.4 | 26 | 8.4 GB/s +Alloc/Free on device: | 0.000607269 | 0.2 | 269 | +GPU: 94% Copy H<->D: 0.59% Alloc/free: 0.22% Comm: 0% CPU & I/O: 4.7% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.87675 + +Total time for the whole computation 23.9668 + +[Slurm] Power consumption (32 s): 0.426 kW 0.004 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx942 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx942 new file mode 100644 index 0000000000..6a03fc85ea --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx942 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 08-06-2026 -- 14:34:27 +OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD Instinct MI300A Accelerator +Total number of threads:192 +GPU model: AMD Instinct MI300A +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 13.9131 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.68663 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.384921 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0427689 +Standard deviation between time steps: 0.0226606 +Time elapsed in the skipped time steps: 1.95805 + + +Standard counter 
description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00332468 | 7.8 | 3 +Convection operator | 0.00420472 | 9.8 | 3 +Diffusion operator | 0.002919138 | 6.8 | 3 +Gradient operator | 0.001110474 | 2.6 | 6 +Divergence operator | 0.001066916 | 2.5 | 4 +Source terms | 0.0004038194 | 0.9 | 3 +Update ::mettre_a_jour | 0.009225593 | 21.6 | 1 +Computation of the time step dt | 0.0002626509 | 0.6 | 2 +Post-treatment operations | 0.0174134 | 40.7 | 1 +Other operations | 0.002837557 | 6.6 | + +Average number of iteration of the linear solver per call: 0.37 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00305653 | 7.1 | 3 | +Kernels: | 0.0285809 | 66.8 | 451 | +Copy host to device: | 0.000510485 | 1.2 | 33 | 0.0 GB/s +Copy device to host: | 0.00102205 | 2.4 | 34 | 8.1 GB/s +Alloc/Free on device: | 9.63393e-05 | 0.2 | 253 | +GPU: 74% Copy H<->D: 3.6% Alloc/free: 0.23% Comm: 0% CPU & I/O: 22% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.75888 + +Total time for the whole computation 18.015 + +[Slurm] Power consumption (25 s): 0.597 kW 0.004 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.dalianvl_cc100 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.dalianvl_cc100 new file mode 100644 index 
0000000000..2cd913980f --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:12:33 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 6.5758 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.466502 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time 
loop: 0.267319 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0297022 +Standard deviation between time steps: 0.0168388 +Time elapsed in the skipped time steps: 1.67483 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00289441 | 9.7 | 3 +Convection operator | 0.001819897 | 6.1 | 3 +Diffusion operator | 0.001806095 | 6.1 | 3 +Gradient operator | 0.0009081656 | 3.1 | 6 +Divergence operator | 0.001402306 | 4.7 | 4 +Source terms | 0.0005865699 | 2.0 | 3 +Update ::mettre_a_jour | 0.004325817 | 14.6 | 1 +Computation of the time step dt | 0.0001865382 | 0.6 | 2 +Post-treatment operations | 0.01290875 | 43.5 | 1 +Other operations | 0.002863604 | 9.6 | + +Average number of iteration of the linear solver per call: 0.37 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00221351 | 7.5 | 3 | +Kernels: | 0.0144006 | 48.5 | 451 | +Copy host to device: | 0.00057469 | 1.9 | 33 | 0.0 GB/s +Copy device to host: | 0.000497414 | 1.7 | 26 | 16.7 GB/s +Alloc/Free on device: | 0.00182166 | 6.1 | 269 | +GPU: 56% Copy H<->D: 3.6% Alloc/free: 6.1% Comm: 0% CPU & I/O: 34% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.05186 + +Total time for the whole computation 9.56983 + 
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is157091_cc86 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is157091_cc86 new file mode 100644 index 0000000000..7f06a54c3b --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is157091_cc86 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 31-05-2026 -- 19:50:10 +OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 +CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores +Total number of threads:64 +GPU model: NVIDIA RTX A6000 +CUDA runtime version: 12.90 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 7.00921 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.519914 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.56956 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0632844 +Standard deviation between time steps: 0.0113425 +Time elapsed in the skipped time steps: 1.82503 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00376538 | 5.9 | 3 +Convection operator | 0.008504433 | 13.4 | 3 +Diffusion operator | 0.005449793 | 8.6 | 3 +Gradient operator | 0.002636608 | 4.2 | 6 +Divergence operator | 0.001631437 | 2.6 | 4 +Source terms | 0.0007752543 | 1.2 | 3 +Update ::mettre_a_jour | 0.02134089 | 33.7 | 1 +Computation of the time step dt | 0.0005473281 | 0.9 | 2 +Post-treatment operations | 0.01282795 | 20.3 | 1 +Other operations | 0.005805318 | 9.2 | + +Average number of iteration of the linear solver per call: 0.37 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00337555 | 5.3 | 3 | +Kernels: | 0.0529067 | 83.6 | 451 | +Copy host to device: | 0.000292819 | 0.5 | 33 | 0.1 GB/s +Copy device to host: | 0.00106282 | 1.7 | 26 | 7.8 GB/s +Alloc/Free on device: | 0.000734244 | 1.2 | 269 | +GPU: 89% Copy H<->D: 2.1% Alloc/free: 1.2% Comm: 0% CPU & I/O: 7.8% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 
Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.56347 + +Total time for the whole computation 9.96728 + diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is159479_cc120 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..fb973e970a --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is159479_cc120 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-05-2026 -- 07:01:09 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May 1 12:45:19 UTC 2026 (6 +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 5.37035 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per 
call: 0.342442 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.254878 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0283198 +Standard deviation between time steps: 0.00794149 +Time elapsed in the skipped time steps: 1.32327 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00213505 | 7.5 | 3 +Convection operator | 0.002743935 | 9.7 | 3 +Diffusion operator | 0.00208883 | 7.4 | 3 +Gradient operator | 0.0006796297 | 2.4 | 6 +Divergence operator | 0.0006408326 | 2.3 | 4 +Source terms | 0.000344486 | 1.2 | 3 +Update ::mettre_a_jour | 0.009316315 | 32.9 | 1 +Computation of the time step dt | 0.0002445942 | 0.9 | 2 +Post-treatment operations | 0.008318809 | 29.4 | 1 +Other operations | 0.001807347 | 6.4 | + +Average number of iteration of the linear solver per call: 0.37 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00189167 | 6.7 | 3 | +Kernels: | 0.0215655 | 76.1 | 461 | +Copy host to device: | 0.000192181 | 0.7 | 33 | 0.1 GB/s +Copy device to host: | 0.0011407 | 4.0 | 26 | 7.3 GB/s +Alloc/Free on device: | 
0.000567154 | 2.0 | 271 | +GPU: 83% Copy H<->D: 4.7% Alloc/free: 2% Comm: 0% CPU & I/O: 10% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.441399 + +Total time for the whole computation 7.38991 + diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is247793_gfx1100 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is247793_gfx1100 new file mode 100644 index 0000000000..15535d1138 --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is247793_gfx1100 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 18:58:02 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 7.62223 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.897327 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.34957 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.149952 +Standard deviation between time steps: 0.0408195 +Time elapsed in the skipped time steps: 1.88478 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.010451 | 7.0 | 3 +Convection operator | 0.00830886 | 5.5 | 3 +Diffusion operator | 0.006585594 | 4.4 | 3 +Gradient operator | 0.002247286 | 1.5 | 6 +Divergence operator | 0.002138017 | 1.4 | 4 +Source terms | 0.001000203 | 0.7 | 3 +Update ::mettre_a_jour | 0.02632996 | 17.6 | 1 +Computation of the time step dt | 0.0007627901 | 0.5 | 2 +Post-treatment operations | 0.08695642 | 58.0 | 1 +Other operations | 0.005171515 | 3.4 | + +Average number of iteration of the linear solver per call: 0.37 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.01003 | 6.7 | 3 | +Kernels: | 0.13202 | 88.0 | 451 | +Copy host to device: | 0.000783234 | 0.5 | 33 | 0.0 GB/s +Copy device to host: | 0.00111942 | 0.7 | 26 | 7.4 GB/s +Alloc/Free on device: | 0.00106206 | 0.7 | 269 | +GPU: 95% Copy H<->D: 1.3% Alloc/free: 0.71% Comm: 0% CPU & I/O: 3.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.722624 + +Total time for the whole computation 11.5792 + diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.jean-zay_cc90 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.jean-zay_cc90 new file mode 100644 index 0000000000..12c6f370df --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.jean-zay_cc90 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-05-2026 -- 09:48:35 +OS: jzxh116__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 10.006 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.498991 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.450887 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0500986 +Standard deviation between time steps: 0.0548609 +Time elapsed in the skipped time steps: 2.21404 + + +Standard counter 
description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00273623 | 5.5 | 3 +Convection operator | 0.002399998 | 4.8 | 3 +Diffusion operator | 0.002259157 | 4.5 | 3 +Gradient operator | 0.001007029 | 2.0 | 6 +Divergence operator | 0.001120468 | 2.2 | 4 +Source terms | 0.0005228747 | 1.0 | 3 +Update ::mettre_a_jour | 0.008564734 | 17.1 | 1 +Computation of the time step dt | 0.0002005593 | 0.4 | 2 +Post-treatment operations | 0.02815309 | 56.2 | 1 +Other operations | 0.003134466 | 6.3 | + +Average number of iteration of the linear solver per call: 0.37 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00221922 | 4.4 | 3 | +Kernels: | 0.0194838 | 38.9 | 461 | +Copy host to device: | 0.000445538 | 0.9 | 33 | 0.0 GB/s +Copy device to host: | 0.00111509 | 2.2 | 26 | 7.4 GB/s +Alloc/Free on device: | 0.00107831 | 2.2 | 271 | +GPU: 43% Copy H<->D: 3.1% Alloc/free: 2.2% Comm: 0% CPU & I/O: 51% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.861773 + +Total time for the whole computation 13.5327 + +[Slurm] Power consumption (21 s): 0.431 kW 0.003 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.lumi_gfx90a b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.lumi_gfx90a new file mode 100644 index 
0000000000..0766d8527d --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.lumi_gfx90a @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 15-05-2026 -- 19:42:18 +OS: nid005018__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +CPU model : AMD EPYC 7A53 64-Core Processor +Total number of threads:128 +GPU model: AMD Instinct MI250X +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 52.831 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.5103 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time 
of the time loop: 2.24607 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.249563 +Standard deviation between time steps: 0.0899302 +Time elapsed in the skipped time steps: 2.44083 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00276879 | 1.1 | 3 +Convection operator | 0.0105047 | 4.2 | 3 +Diffusion operator | 0.007269927 | 2.9 | 3 +Gradient operator | 0.001888666 | 0.8 | 6 +Divergence operator | 0.001826117 | 0.7 | 4 +Source terms | 0.0006813406 | 0.3 | 3 +Update ::mettre_a_jour | 0.03157014 | 12.7 | 1 +Computation of the time step dt | 0.0004289448 | 0.2 | 2 +Post-treatment operations | 0.1885739 | 75.6 | 1 +Other operations | 0.004050516 | 1.6 | + +Average number of iteration of the linear solver per call: 0.37 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00239408 | 1.0 | 3 | +Kernels: | 0.235169 | 94.2 | 451 | +Copy host to device: | 0.000564144 | 0.2 | 33 | 0.0 GB/s +Copy device to host: | 0.000952524 | 0.4 | 26 | 8.7 GB/s +Alloc/Free on device: | 0.000560723 | 0.2 | 269 | +GPU: 95% Copy H<->D: 0.61% Alloc/free: 0.22% Comm: 0% CPU & I/O: 4% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.70586 + +Total time for the whole computation 59.2237 
+ +[Slurm] Power consumption (79 s): 0.480 kW 0.011 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.topaze_cc80 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.topaze_cc80 new file mode 100644 index 0000000000..1132193177 --- /dev/null +++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.topaze_cc80 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 15-05-2026 -- 13:21:39 +OS: topaze7064__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : AMD EPYC 7763 64-Core Processor +Total number of threads:256 +GPU model: NVIDIA A100-SXM4-80GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 9.93835 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.688868 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.395459 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0439399 +Standard deviation between time steps: 0.0239771 +Time elapsed in the skipped time steps: 2.57337 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00333603 | 7.6 | 3 +Convection operator | 0.003698002 | 8.4 | 3 +Diffusion operator | 0.003023572 | 6.9 | 3 +Gradient operator | 0.001306829 | 3.0 | 6 +Divergence operator | 0.001484432 | 3.4 | 4 +Source terms | 0.0006081321 | 1.4 | 3 +Update ::mettre_a_jour | 0.01068189 | 24.3 | 1 +Computation of the time step dt | 0.0002696096 | 0.6 | 2 +Post-treatment operations | 0.01635107 | 37.2 | 1 +Other operations | 0.003180315 | 7.2 | + +Average number of iteration of the linear solver per call: 0.37 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00269444 | 6.1 | 3 | +Kernels: | 0.0273097 | 62.2 | 451 | +Copy host to device: | 0.000447015 | 1.0 | 33 | 0.0 GB/s +Copy device to host: | 0.00123098 | 2.8 | 26 | 6.7 GB/s +Alloc/Free on device: | 0.00108718 | 2.5 | 269 | +GPU: 68% Copy H<->D: 3.8% Alloc/free: 2.5% Comm: 0% CPU & I/O: 25% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 
Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.36392 + +Total time for the whole computation 14.2711 + +[Slurm] Power consumption (51 s): 0.611 kW 0.009 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/Canal_VDF/check_perf.sh b/tests/GPU/Canal_VDF/check_perf.sh new file mode 120000 index 0000000000..6d20411c12 --- /dev/null +++ b/tests/GPU/Canal_VDF/check_perf.sh @@ -0,0 +1 @@ +../DomainFlowLES/check_perf.sh \ No newline at end of file diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing.data b/tests/GPU/ColdLegMixing/ColdLegMixing.data index 322a967d4f..b6e284f0e1 100644 --- a/tests/GPU/ColdLegMixing/ColdLegMixing.data +++ b/tests/GPU/ColdLegMixing/ColdLegMixing.data @@ -42,7 +42,7 @@ Scatter DOM.Zones dom END SCATTER # VEFPreP1B dis -lire dis { P0 P1 changement_de_base_P1bulle 1 CL_pression_sommet_faible 0 modif_div_face_dirichlet 0 } +Lire dis { reorder { algo none } } # reorder make differences # Runge_Kutta_rationnel_ordre_2 sch lire sch @@ -91,7 +91,7 @@ Lire pb preconditionnement_diag 1 seuil_diffusion_implicite 1e-10 } - solveur_pression AMG GCP { atol 1e-5 impr } + solveur_pression AMG GCP { atol 1e-9 impr } convection { MUSCL } diffusion { } conditions_initiales { vitesse champ_uniforme 3 0 0 0 } diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.adastra_gfx90a b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.adastra_gfx90a index dda0d84caa..34cd597e00 100644 --- a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.adastra_gfx90a +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.adastra_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 
21:07:42 -OS: g1031__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 09:16:32 +OS: g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2160000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 54.8818 +Total time of the start-up: 55.4215 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.50432 +Average time of the resolution of the linear problem per call: 1.57831 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 8.39766 +Total time of the time loop: 7.89771 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.933074 -Standard deviation between time steps: 0.168845 -Time elapsed in the skipped time steps: 1.56224 +Average time per time step: 0.877524 +Standard deviation between time steps: 0.345544 +Time elapsed in the skipped time steps: 1.75101 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.121511 | 11.0 | 2 
-Matrix assembly for implicit scheme | 0.04890356 | 4.4 | 4 -Convection operator | 0.1384608 | 12.5 | 10 -Diffusion operator | 0.1001888 | 9.1 | 18 -Gradient operator | 0.1041166 | 9.4 | 5 -Divergence operator | 0.03953337 | 3.6 | 6 -Source terms | 0.001168745 | 0.1 | 4 -Update ::mettre_a_jour | 0.2427784 | 21.9 | 4 -Solver for implicit diffusion | 0.02933033 | 2.7 | 4 -Computation of the time step dt | 0.02264737 | 2.0 | 6 -Post-treatment operations | 0.07146423 | 6.5 | 1 -Other operations | 0.01297066 | 1.2 | +Linear solver resolutions Ax=B | 0.154441 | 17.6 | 2 +Matrix assembly for implicit scheme | 0.0378347 | 4.3 | 4 +Convection operator | 0.0779121 | 8.9 | 9 +Diffusion operator | 0.08704269 | 9.9 | 18 +Gradient operator | 0.05078478 | 5.8 | 5 +Divergence operator | 0.02666375 | 3.0 | 6 +Source terms | 0.0009005066 | 0.1 | 4 +Update ::mettre_a_jour | 0.2270792 | 25.9 | 4 +Solver for implicit diffusion | 0.02766899 | 3.2 | 4 +Computation of the time step dt | 0.01654435 | 1.9 | 6 +Post-treatment operations | 0.1431845 | 16.3 | 1 +Other operations | 0.02746652 | 3.1 | -Average number of iteration of the linear solver per call: 33.7 +Average number of iteration of the linear solver per call: 52.7 ----------------------------------------------------------------------------------------------------------- @@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call: 33.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.121127 | 13.0 | 2 | -Kernels: | 0.775968 | 83.2 | 1111 | -Copy host to device: | 0.000841133 | 0.1 | 42 | 2.6 GB/s -Copy device to host: | 0.00288309 | 0.3 | 44 | 13.7 GB/s -Alloc/Free on device: | 0.000129747 | 0.0 | 11 | -GPU: 96% Copy H<->D: 0.4% Alloc/free: 0.014% Comm: 0% 
CPU & I/O: 3.4% +Libraries: | 0.154065 | 17.6 | 2 | +Kernels: | 0.613257 | 69.9 | 1097 | +Copy host to device: | 0.000828083 | 0.1 | 42 | 2.6 GB/s +Copy device to host: | 0.00289064 | 0.3 | 44 | 13.6 GB/s +Alloc/Free on device: | 0.000112598 | 0.0 | 11 | +GPU: 87% Copy H<->D: 0.42% Alloc/free: 0.013% Comm: 0% CPU & I/O: 12% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 2.54669 +Time of the post-resolution: 2.69953 -Total time for the whole computation 67.3884 +Total time for the whole computation 67.7697 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (76 s): 0.483 kW 0.010 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.dalianvl_cc100 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..e99fd2ca48 --- /dev/null +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the ColdLegMixing_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:13:09 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2160000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 27.0598 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.11402 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.86596 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.207329 +Standard deviation between time steps: 0.0668016 +Time elapsed in the skipped time steps: 0.674962 + + +Standard counter description | 
Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0830543 | 40.1 | 2 +Matrix assembly for implicit scheme | 0.003942188 | 1.9 | 4 +Convection operator | 0.02023463 | 9.8 | 10 +Diffusion operator | 0.009598779 | 4.6 | 18 +Gradient operator | 0.005206757 | 2.5 | 5 +Divergence operator | 0.002221272 | 1.1 | 6 +Source terms | 0.0004284402 | 0.2 | 4 +Update ::mettre_a_jour | 0.02895494 | 14.0 | 4 +Solver for implicit diffusion | 0.009809403 | 4.7 | 4 +Computation of the time step dt | 0.001842386 | 0.9 | 6 +Post-treatment operations | 0.02996372 | 14.5 | 1 +Other operations | 0.01207206 | 5.8 | + +Average number of iteration of the linear solver per call: 53 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0827913 | 39.9 | 2 | +Kernels: | 0.0879506 | 42.4 | 1063 | +Copy host to device: | 0.000721296 | 0.3 | 42 | 3.0 GB/s +Copy device to host: | 0.000766822 | 0.4 | 31 | 44.4 GB/s +Alloc/Free on device: | 0.0016505 | 0.8 | 39 | +GPU: 82% Copy H<->D: 0.72% Alloc/free: 0.8% Comm: 0% CPU & I/O: 16% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.59505 + +Total time for the whole computation 31.1958 + diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.eureka_cc89 
b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..ea0331118d --- /dev/null +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.eureka_cc89 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the ColdLegMixing_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:29:14 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2160000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 40.8787 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.41704 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 4.14205 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.460228 +Standard deviation between time steps: 0.13345 +Time elapsed in the skipped time steps: 1.76797 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.150019 | 32.6 | 2 +Matrix assembly for implicit scheme | 0.01175697 | 2.6 | 4 +Convection operator | 0.05860689 | 12.7 | 10 +Diffusion operator | 0.03013431 | 6.5 | 18 +Gradient operator | 0.01550689 | 3.4 | 5 +Divergence operator | 0.008359857 | 1.8 | 6 +Source terms | 0.001063969 | 0.2 | 4 +Update ::mettre_a_jour | 0.0559526 | 12.2 | 4 +Solver for implicit diffusion | 0.03351775 | 7.3 | 4 +Computation of the time step dt | 0.005934557 | 1.3 | 6 +Post-treatment operations | 0.05752008 | 12.5 | 1 +Other operations | 0.03185519 | 6.9 | + +Average number of iteration of the linear solver per call: 53 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.149355 | 32.5 | 2 | +Kernels: | 0.291186 | 63.3 | 1110 | +Copy host to device: | 0.000486001 | 0.1 | 42 | 4.4 GB/s +Copy device to host: | 0.00310949 | 0.7 | 45 | 12.7 GB/s +Alloc/Free on device: | 0.000238185 | 0.1 | 11 | +GPU: 96% Copy H<->D: 0.78% Alloc/free: 0.052% Comm: 0% CPU & I/O: 3.4% 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.773176 + +Total time for the whole computation 47.5619 + diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.irene-amd-ccrt_cc70 index 7741a83d11..3d7f462e42 100644 --- a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.irene-amd-ccrt_cc70 +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.irene-amd-ccrt_cc70 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 14:08:56 -OS: irene7056__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 23-04-2026 -- 14:44:19 +OS: irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz Total number of threads:80 GPU model: Tesla V100-SXM2-16GB @@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2160000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 68.8927 +Total time of the start-up: 72.2546 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 2.55071 +Average time of the resolution of the linear problem per call: 2.2267 Average number 
of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 6.35237 +Total time of the time loop: 5.59819 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.705819 -Standard deviation between time steps: 0.111849 -Time elapsed in the skipped time steps: 2.75659 +Average time per time step: 0.622021 +Standard deviation between time steps: 0.100609 +Time elapsed in the skipped time steps: 2.66817 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.166276 | 23.6 | 2 -Matrix assembly for implicit scheme | 0.03361593 | 4.8 | 4 -Convection operator | 0.1028779 | 14.6 | 10 -Diffusion operator | 0.06363367 | 9.0 | 18 -Gradient operator | 0.04134556 | 5.9 | 5 -Divergence operator | 0.04251433 | 6.0 | 6 -Source terms | 0.001611651 | 0.2 | 4 -Update ::mettre_a_jour | 0.09962028 | 14.1 | 4 -Solver for implicit diffusion | 0.04006988 | 5.7 | 4 -Computation of the time step dt | 0.02328207 | 3.3 | 6 -Post-treatment operations | 0.05626412 | 8.0 | 1 -Other operations | 0.0347072 | 4.9 | +Linear solver resolutions Ax=B | 0.228427 | 36.7 | 2 +Matrix assembly for implicit scheme | 0.02427188 | 3.9 | 4 +Convection operator | 0.05417733 | 8.7 | 9 +Diffusion operator | 0.03900059 | 6.3 | 18 +Gradient operator | 0.02075651 | 3.3 | 5 +Divergence operator | 0.02327661 | 3.7 | 6 +Source terms | 0.001577241 | 0.3 | 4 +Update ::mettre_a_jour | 0.08490219 | 13.6 | 4 +Solver for implicit diffusion | 0.03963722 | 6.4 | 4 +Computation of the time step dt | 0.0155664 | 2.5 | 
6 +Post-treatment operations | 0.05371404 | 8.6 | 1 +Other operations | 0.03671423 | 5.9 | -Average number of iteration of the linear solver per call: 32.7 +Average number of iteration of the linear solver per call: 52.8 ----------------------------------------------------------------------------------------------------------- @@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call: 32.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.165709 | 23.5 | 2 | -Kernels: | 0.49784 | 70.5 | 1111 | -Copy host to device: | 0.00112713 | 0.2 | 42 | 1.9 GB/s -Copy device to host: | 0.00941522 | 1.3 | 44 | 4.2 GB/s -Alloc/Free on device: | 0.000490555 | 0.1 | 11 | -GPU: 94% Copy H<->D: 1.5% Alloc/free: 0.07% Comm: 0% CPU & I/O: 4.4% +Libraries: | 0.227866 | 36.6 | 2 | +Kernels: | 0.350872 | 56.4 | 1097 | +Copy host to device: | 0.00111442 | 0.2 | 42 | 1.9 GB/s +Copy device to host: | 0.0095051 | 1.5 | 44 | 4.1 GB/s +Alloc/Free on device: | 0.00031579 | 0.1 | 11 | +GPU: 93% Copy H<->D: 1.7% Alloc/free: 0.051% Comm: 0% CPU & I/O: 5.2% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 1.89088 +Time of the post-resolution: 2.05112 -Total time for the whole computation 79.8925 +Total time for the whole computation 82.5721 -[Slurm] Power consumption (104 s): 0.220 kW 0.006 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (97 s): 0.226 kW 0.006 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is157091_cc86 
b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is157091_cc86 index 0ccdcbfa56..6bc6865978 100644 --- a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is157091_cc86 +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is157091_cc86 @@ -8,7 +8,7 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 10-03-2026 -- 08:37:35 +Date: 22-04-2026 -- 07:49:49 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2160000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 42.1372 +Total time of the start-up: 44.4438 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.15194 +Average time of the resolution of the linear problem per call: 1.42068 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 6.28092 +Total time of the time loop: 6.11625 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.69788 -Standard deviation between time steps: 
0.11746 -Time elapsed in the skipped time steps: 1.51742 +Average time per time step: 0.679583 +Standard deviation between time steps: 0.087326 +Time elapsed in the skipped time steps: 1.67971 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.167858 | 24.1 | 2 -Matrix assembly for implicit scheme | 0.03240089 | 4.6 | 4 -Convection operator | 0.09896009 | 14.2 | 10 -Diffusion operator | 0.05078759 | 7.3 | 18 -Gradient operator | 0.03526034 | 5.1 | 5 -Divergence operator | 0.0328344 | 4.7 | 6 -Source terms | 0.001295119 | 0.2 | 4 -Update ::mettre_a_jour | 0.1129639 | 16.2 | 4 -Solver for implicit diffusion | 0.04478752 | 6.4 | 4 -Computation of the time step dt | 0.02486768 | 3.6 | 6 -Post-treatment operations | 0.05516704 | 7.9 | 1 -Other operations | 0.04069752 | 5.8 | +Linear solver resolutions Ax=B | 0.16942 | 24.9 | 2 +Matrix assembly for implicit scheme | 0.03782602 | 5.6 | 4 +Convection operator | 0.100055 | 14.7 | 10 +Diffusion operator | 0.05079105 | 7.5 | 18 +Gradient operator | 0.02477128 | 3.6 | 5 +Divergence operator | 0.03284109 | 4.8 | 6 +Source terms | 0.001298781 | 0.2 | 4 +Update ::mettre_a_jour | 0.107247 | 15.8 | 4 +Solver for implicit diffusion | 0.04514434 | 6.6 | 4 +Computation of the time step dt | 0.02508486 | 3.7 | 6 +Post-treatment operations | 0.04197711 | 6.2 | 1 +Other operations | 0.04312715 | 6.3 | Average number of iteration of the linear solver per call: 32.7 @@ -63,16 +63,16 @@ Average number of iteration of the linear solver per call: 32.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.167302 | 24.0 | 2 | -Kernels: | 0.496374 | 
71.1 | 1111 | -Copy host to device: | 0.00053393 | 0.1 | 42 | 4.0 GB/s -Copy device to host: | 0.00382078 | 0.5 | 44 | 10.3 GB/s -Alloc/Free on device: | 0.000359834 | 0.1 | 11 | -GPU: 95% Copy H<->D: 0.62% Alloc/free: 0.052% Comm: 0% CPU & I/O: 4.2% +Libraries: | 0.168832 | 24.8 | 2 | +Kernels: | 0.488327 | 71.9 | 1107 | +Copy host to device: | 0.000582148 | 0.1 | 42 | 3.7 GB/s +Copy device to host: | 0.00392578 | 0.6 | 44 | 10.0 GB/s +Alloc/Free on device: | 0.000275118 | 0.0 | 11 | +GPU: 97% Copy H<->D: 0.66% Alloc/free: 0.04% Comm: 0% CPU & I/O: 2.6% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.87779 +Time of the post-resolution: 0.919342 -Total time for the whole computation 50.8133 +Total time for the whole computation 53.1591 diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is159479_cc120 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..c6cae453a5 --- /dev/null +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is159479_cc120 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the ColdLegMixing_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:32:59 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2160000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 33.0966 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.828613 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.23231 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.248034 +Standard deviation between time steps: 0.0364159 +Time elapsed in the 
skipped time steps: 0.984948 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.103753 | 41.8 | 2 +Matrix assembly for implicit scheme | 0.005429958 | 2.2 | 4 +Convection operator | 0.02593384 | 10.5 | 9 +Diffusion operator | 0.01465164 | 5.9 | 18 +Gradient operator | 0.007345582 | 3.0 | 5 +Divergence operator | 0.003139903 | 1.3 | 6 +Source terms | 0.0005302821 | 0.2 | 4 +Update ::mettre_a_jour | 0.03202612 | 12.9 | 4 +Solver for implicit diffusion | 0.01674319 | 6.8 | 4 +Computation of the time step dt | 0.002658233 | 1.1 | 6 +Post-treatment operations | 0.01895837 | 7.6 | 1 +Other operations | 0.01686431 | 6.8 | + +Average number of iteration of the linear solver per call: 52.8 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.103627 | 41.8 | 2 | +Kernels: | 0.128405 | 51.8 | 1097 | +Copy host to device: | 0.000396511 | 0.2 | 42 | 5.5 GB/s +Copy device to host: | 0.00455285 | 1.8 | 44 | 8.7 GB/s +Alloc/Free on device: | 0.000110567 | 0.0 | 11 | +GPU: 94% Copy H<->D: 2% Alloc/free: 0.045% Comm: 0% CPU & I/O: 4.4% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.711175 + +Total time for the whole computation 37.025 + diff --git 
a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is247793_gfx1100 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is247793_gfx1100 index 4a55029b7d..036dba545d 100644 --- a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is247793_gfx1100 +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is247793_gfx1100 @@ -1,53 +1,78 @@ -Statistiques d'initialisation du calcul - -Temps total 57.599 - -Statistiques de resolution du probleme - -Temps total 14.8858 - - -Timesteps 10 -Secondes / pas de temps 1.48857 -Dont solveurs Ax=B 0.154625 10% (2 appels/pas de temps) -Dont solveur diffusion_implicite 0.060475 4% (4 appels/pas de temps) -Dont assemblage matrice_implicite 0.071929 4% (4 appels/pas de temps) -Dont mettre_a_jour 0.152573 10% (4 appels/pas de temps) -Dont operateurs convection 0.227592 15% (9.6 appels/pas de temps) -Dont operateurs diffusion 0.108532 7% (18 appels/pas de temps) -Dont operateurs gradient 0.044300 2% (5 appels/pas de temps) -Dont operateurs divergence 0.022561 1% (6 appels/pas de temps) -Dont operateurs source 0.005090 0% (4 appels/pas de temps) -Dont operations postraitement 0.574336 38% (1 appel/pas de temps) -Dont calcul dt 0.013271 0% (6 appels/pas de temps) -Dont calcul divers 0.053290 3% (0 appels/pas de temps) -Nb solveur / pas de temps 2 -Secondes / solveur 0.0773127 -Iterations / solveur 21.4 -GPU statistics per time step (experimental): -Libraries : 0.154164 s 10.4% 2.0 calls -Kernels : 0.622647 s 41.8% 4609110.7 calls -Copy H2D : 0.042943 s 2.9% 102.3 calls 9.2 GB/s -Copy D2H : 0.010556 s 0.7% 102.4 calls 15.4 GB/s -Alloc/Free: 0.006001 s 0.4% 74.7 calls -GPU: 52.1% Copy H<->D: 3.5% Alloc/Free: 0.4% Comm: 0% CPU & Others: 43.8% -I/O: - -Timesteps = number of time steps -Nb solveur = number of linear system resolutions -Nb assemblage implicite = number of matrix assemblies for the implicit scheme -Iterations = average number of iterations of the solver -Communications = fraction of the time spent - in communications between processors 
(excluding io files) -Network latency = time of one mpsum measured by an internal bench over 0.1s -Network bandwidth = maximum on all processors - of the average bandwidth of send_recv operations -Waiting time = estimation of the waiting time of the different processors - -Max_waiting_time big => probably due to a bad partitioning -Communications > 30% => too many processors or network too slow - -Statistiques de post resolution - -Temps total 4.71802 + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the ColdLegMixing_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 18:30:47 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2160000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 38.1848 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.76015 +Average number of iteration of the linear solver per call: 0 + 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 7.398 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.822 +Standard deviation between time steps: 0.165676 +Time elapsed in the skipped time steps: 2.16854 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.271718 | 33.1 | 2 +Matrix assembly for implicit scheme | 0.02632243 | 3.2 | 4 +Convection operator | 0.1223453 | 14.9 | 10 +Diffusion operator | 0.06110811 | 7.4 | 18 +Gradient operator | 0.03366703 | 4.1 | 5 +Divergence operator | 0.02193279 | 2.7 | 6 +Source terms | 0.001620552 | 0.2 | 4 +Update ::mettre_a_jour | 0.1051999 | 12.8 | 4 +Solver for implicit diffusion | 0.04933715 | 6.0 | 4 +Computation of the time step dt | 0.01476115 | 1.8 | 6 +Post-treatment operations | 0.06948995 | 8.5 | 1 +Other operations | 0.04449732 | 5.4 | + +Average number of iteration of the linear solver per call: 53 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.270707 | 32.9 | 2 | +Kernels: | 0.519918 | 63.3 | 1063 | +Copy host to device: | 0.00111484 | 0.1 | 42 | 1.9 GB/s +Copy device to host: | 0.00244084 | 0.3 | 31 | 14.0 GB/s +Alloc/Free on 
device: | 0.00112784 | 0.1 | 39 | +GPU: 96% Copy H<->D: 0.43% Alloc/free: 0.14% Comm: 0% CPU & I/O: 3.2% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.09806 + +Total time for the whole computation 48.8494 diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_TMIN.son.ref b/tests/GPU/ColdLegMixing/ColdLegMixing_TMIN.son.ref index b8608d39cf..a2b95357eb 100644 --- a/tests/GPU/ColdLegMixing/ColdLegMixing_TMIN.son.ref +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_TMIN.son.ref @@ -4,12 +4,12 @@ # Type NUMERO_ELEM_SUR_MAITRE 0.00000000e+00 0.00000000e+00 5.00000000e-03 0.00000000e+00 -1.00000000e-02 -1.54054285e-11 -1.50000000e-02 -1.35238391e-10 -2.00000000e-02 -5.29718902e-10 -2.50000000e-02 -1.44619321e-09 -3.00000000e-02 -3.20614607e-09 -3.50000000e-02 -6.20193023e-09 -4.00000000e-02 -1.08928586e-08 -4.50000000e-02 -1.78005486e-08 -5.00000000e-02 -2.75035004e-08 +1.00000000e-02 -1.54054293e-11 +1.50000000e-02 -1.35238393e-10 +2.00000000e-02 -5.29718896e-10 +2.50000000e-02 -1.44619318e-09 +3.00000000e-02 -3.20614598e-09 +3.50000000e-02 -6.20193004e-09 +4.00000000e-02 -1.08928583e-08 +4.50000000e-02 -1.78005482e-08 +5.00000000e-02 -2.75034999e-08 diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_VMAX.son.ref b/tests/GPU/ColdLegMixing/ColdLegMixing_VMAX.son.ref index 4b5bf6d664..5a4f10ebd6 100644 --- a/tests/GPU/ColdLegMixing/ColdLegMixing_VMAX.son.ref +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_VMAX.son.ref @@ -3,13 +3,13 @@ # Champ VMAX [m/s] # Type NUMERO_ELEM_SUR_MAITRE 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 -5.00000000e-03 2.83341776e-03 2.29668974e-03 9.28005173e-04 -1.00000000e-02 5.68379870e-03 4.55499731e-03 1.85889956e-03 -1.50000000e-02 8.54247059e-03 6.79018237e-03 2.79168084e-03 
-2.00000000e-02 1.14022905e-02 9.01491002e-03 3.72483538e-03 +5.00000000e-03 2.83341777e-03 2.29668974e-03 9.28005164e-04 +1.00000000e-02 5.68379868e-03 4.55499732e-03 1.85889956e-03 +1.50000000e-02 8.54247057e-03 6.79018239e-03 2.79168083e-03 +2.00000000e-02 1.14022905e-02 9.01491004e-03 3.72483537e-03 2.50000000e-02 1.42619333e-02 1.12325293e-02 4.65783489e-03 -3.00000000e-02 1.71207647e-02 1.34443266e-02 5.59032875e-03 -3.50000000e-02 1.99784164e-02 1.56505655e-02 6.52205471e-03 -4.00000000e-02 2.28346205e-02 1.78509971e-02 7.45279984e-03 -4.50000000e-02 2.56891456e-02 2.00536201e-02 8.38237896e-03 +3.00000000e-02 1.71207647e-02 1.34443265e-02 5.59032877e-03 +3.50000000e-02 1.99784164e-02 1.56505655e-02 6.52205474e-03 +4.00000000e-02 2.28346205e-02 1.78509971e-02 7.45279986e-03 +4.50000000e-02 2.56891456e-02 2.00536201e-02 8.38237897e-03 5.00000000e-02 2.85417720e-02 2.22548485e-02 9.31062254e-03 diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_VMIN.son.ref b/tests/GPU/ColdLegMixing/ColdLegMixing_VMIN.son.ref index 3adde92724..c4e9503b6a 100644 --- a/tests/GPU/ColdLegMixing/ColdLegMixing_VMIN.son.ref +++ b/tests/GPU/ColdLegMixing/ColdLegMixing_VMIN.son.ref @@ -3,13 +3,13 @@ # Champ VMIN [m/s] # Type NUMERO_ELEM_SUR_MAITRE 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 -5.00000000e-03 -2.83341863e-03 -3.24383912e-03 -9.28005166e-04 -1.00000000e-02 -5.60710555e-03 -6.54040291e-03 -1.88279343e-03 -1.50000000e-02 -8.34715109e-03 -9.86741368e-03 -2.85002332e-03 -2.00000000e-02 -1.10749232e-02 -1.32043684e-02 -3.81937590e-03 -2.50000000e-02 -1.37964246e-02 -1.65457438e-02 -4.78892735e-03 -3.00000000e-02 -1.65145141e-02 -1.98887507e-02 -5.75785211e-03 -3.50000000e-02 -1.92305284e-02 -2.32318978e-02 -6.72575799e-03 -4.00000000e-02 -2.19653111e-02 -2.65743352e-02 -7.69241057e-03 -4.50000000e-02 -2.47150783e-02 -2.99155320e-02 -8.65762676e-03 +5.00000000e-03 -2.83341860e-03 -3.24383911e-03 -9.28005164e-04 +1.00000000e-02 -5.60710555e-03 -6.54040292e-03 
-1.88279341e-03 +1.50000000e-02 -8.34715112e-03 -9.86741370e-03 -2.85002328e-03 +2.00000000e-02 -1.10749232e-02 -1.32043684e-02 -3.81937586e-03 +2.50000000e-02 -1.37964246e-02 -1.65457438e-02 -4.78892732e-03 +3.00000000e-02 -1.65145140e-02 -1.98887507e-02 -5.75785210e-03 +3.50000000e-02 -1.92305283e-02 -2.32318978e-02 -6.72575799e-03 +4.00000000e-02 -2.19653111e-02 -2.65743352e-02 -7.69241058e-03 +4.50000000e-02 -2.47150784e-02 -2.99155320e-02 -8.65762676e-03 5.00000000e-02 -2.74653003e-02 -3.32551150e-02 -9.62123630e-03 diff --git a/tests/GPU/ColdLegMixing/PAR_ColdLegMixing_BENCH.TU.adastra_gfx942x8 b/tests/GPU/ColdLegMixing/PAR_ColdLegMixing_BENCH.TU.adastra_gfx942x8 new file mode 100644 index 0000000000..bfe1fb7576 --- /dev/null +++ b/tests/GPU/ColdLegMixing/PAR_ColdLegMixing_BENCH.TU.adastra_gfx942x8 @@ -0,0 +1,114 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the PAR_ColdLegMixing_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 08-06-2026 -- 14:36:34 +OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD Instinct MI300A Accelerator +Total number of threads:192 +GPU model: AMD Instinct MI300A +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 8 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2160000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up 
statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 23.5004 +Number of virtual exchanges: 186 +Maximum number of MPI allreduce per time step 400 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 2.04898 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 3.27794 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.364215 +Standard deviation between time steps: 0.0790277 +Time elapsed in the skipped time steps: 0.728734 + +Percent of total time spend in communication: 15.8925 + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.194097 | 53.3 | 2 +Matrix assembly for implicit scheme | 0.004431142 | 1.2 | 4 +Convection operator | 0.007754889 | 2.1 | 9 +Diffusion operator | 0.005278478 | 1.4 | 18 +Gradient operator | 0.00350893 | 1.0 | 5 +Divergence operator | 0.00190806 | 0.5 | 6 +Source terms | 0.0002011109 | 0.1 | 4 +Update ::mettre_a_jour | 0.04637919 | 12.7 | 4 +Solver for implicit diffusion | 0.01803998 | 5.0 | 4 +Computation of the time step dt | 0.002887528 | 0.8 | 6 +Post-treatment operations | 0.04766509 | 13.1 | 1 +Other operations | 0.03206428 | 8.8 | +Number of virtual exchanges per time step: 148 +Maximum number of MPI allreduce per time step 78.8 + 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Average number of iteration of the linear solver per call: 52.7 + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics: IO +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Output write sequential: 1066 MB/s + +--------------------------------------------------------------------------------------------------------- +Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. +--------------------------------------------------------------------------------------------------------- + +Average of the fraction of the time spent in communications between processors: 14.7 % +Max of the fraction of the time spent in communications between processors: 25.7 % +Min of the fraction of the time spent in communications between processors: 12.5 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 1.38126e-05 +Network maximum bandwidth on all processors: 41.9 GB/s +Total network traffic: 4692.53 MB/time step +Average message size: 401.819 kB +Min waiting time: 12.3 % of total time +Max waiting time: 25.4 % of total time +Avg waiting time: 17.65 % of total time + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.19315 | 53.0 | 2 | +Kernels: | 0.0718475 | 19.7 | 2040 | +Copy host to device: | 0.000624345 | 0.2 | 33 | 0.6 GB/s +Copy device to host: | 0.000775929 | 0.2 | 22 | 5.7 GB/s +Alloc/Free on device: | 0.000295281 | 0.1 | 37 | +GPU: 73% Copy H<->D: 0.38% Alloc/free: 0.081% Comm: 19% CPU & I/O: 7.4% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 2.30176 +Maximum number of MPI allreduce per time step 57 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Max waiting time big => probably due to a bad partitioning +Communications > 30% => too many processors or network too slow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Total time for the whole computation 29.8089 + +[Slurm] Power consumption (38 s): 1.690 kW 0.018 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES.data b/tests/GPU/DomainFlowLES/DomainFlowLES.data index 077a878e64..9db085c491 100644 --- a/tests/GPU/DomainFlowLES/DomainFlowLES.data +++ b/tests/GPU/DomainFlowLES/DomainFlowLES.data @@ -81,7 +81,8 @@ Scatter DOM.Zones dom END SCATTER # # Discretization # -VEFPrep1B dis +VEFPrep1B dis +Lire dis { reorder { algo Hilbert } } Runge_Kutta_Rationnel_ordre_2 sch Lire sch diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.adastra_gfx90a b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.adastra_gfx90a index 80e249a8cb..856fc2d793 100644 --- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.adastra_gfx90a +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.adastra_gfx90a 
@@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 18:03:12 -OS: g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 24-05-2026 -- 15:56:05 +OS: g1321__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,41 +22,41 @@ Total number of elements used for the calculation: 3276800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 67.2841 +Total time of the start-up: 55.684 Number of calls to the linear solver per time step: 4 -Average time of the resolution of the linear problem per call: 0.793104 -Average number of iteration of the linear solver per call: 21 +Average time of the resolution of the linear problem per call: 1.0001 +Average number of iteration of the linear solver per call: 21.75 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 11.8106 +Total time of the time loop: 9.88073 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.31229 -Standard deviation between time steps: 0.083117 -Time 
elapsed in the skipped time steps: 2.8317 +Average time per time step: 1.09786 +Standard deviation between time steps: 0.0775836 +Time elapsed in the skipped time steps: 2.6768 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.20118 | 12.4 | 4 -Matrix assembly for implicit scheme | 0.1265401 | 7.8 | 6 -Convection operator | 0.4557769 | 28.0 | 8 -Diffusion operator | 0.1462312 | 9.0 | 36 -Gradient operator | 0.1370221 | 8.4 | 9 -Divergence operator | 0.04048505 | 2.5 | 8 -Source terms | 0.002067465 | 0.1 | 6 -Update ::mettre_a_jour | 0.07502718 | 4.6 | 5 -Solver for implicit diffusion | 0.0577835 | 3.6 | 6 -Computation of the time step dt | 0.02468548 | 1.5 | 10 -Turbulence model::update | 0.01060921 | 0.7 | 2 -Post-treatment operations | 0.0282308 | 1.7 | 2 -Other operations | 0.006654609 | 0.4 | +Linear solver resolutions Ax=B | 0.174566 | 15.9 | 4 +Matrix assembly for implicit scheme | 0.1055591 | 9.6 | 6 +Convection operator | 0.4354912 | 39.7 | 8 +Diffusion operator | 0.1078714 | 9.8 | 36 +Gradient operator | 0.06912533 | 6.3 | 9 +Divergence operator | 0.02441109 | 2.2 | 8 +Source terms | 0.001850366 | 0.2 | 6 +Update ::mettre_a_jour | 0.04996588 | 4.6 | 5 +Solver for implicit diffusion | 0.05493421 | 5.0 | 6 +Computation of the time step dt | 0.01905284 | 1.7 | 10 +Turbulence model::update | 0.008911962 | 0.8 | 2 +Post-treatment operations | 0.02671327 | 2.4 | 2 +Other operations | 0.01940543 | 1.8 | -Average number of iteration of the linear solver per call: 35.3 +Average number of iteration of the linear solver per call: 35.7 ----------------------------------------------------------------------------------------------------------- @@ -64,17 +64,17 @@ Average number of iteration of the linear solver per call: 35.3 
----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.200505 | 15.3 | 4 | -Kernels: | 1.07348 | 81.8 | 1993 | -Copy host to device: | 0.000944822 | 0.1 | 43 | 3.1 GB/s -Copy device to host: | 0.00182325 | 0.1 | 7 | 20.4 GB/s -Alloc/Free on device: | 0.000348043 | 0.0 | 23 | -GPU: 97% Copy H<->D: 0.21% Alloc/free: 0.027% Comm: 0% CPU & I/O: 2.7% +Libraries: | 0.173899 | 15.8 | 4 | +Kernels: | 0.885519 | 80.7 | 1991 | +Copy host to device: | 0.000952295 | 0.1 | 43 | 3.1 GB/s +Copy device to host: | 0.0018143 | 0.2 | 7 | 20.5 GB/s +Alloc/Free on device: | 0.000332402 | 0.0 | 23 | +GPU: 96% Copy H<->D: 0.25% Alloc/free: 0.03% Comm: 0% CPU & I/O: 3.2% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.342931 +Time of the post-resolution: 0.342387 -Total time for the whole computation 82.2694 +Total time for the whole computation 68.5839 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (75 s): 0.475 kW 0.010 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.dalianvl_cc100 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..4dda32c8dd --- /dev/null +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,79 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the DomainFlowLES_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:14:07 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 3276800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 48.5207 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.98739 +Average number of iteration of the linear solver per call: 21.75 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.19618 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.24402 +Standard deviation between time steps: 0.0459336 +Time elapsed in the skipped time steps: 1.21579 + + +Standard counter description | 
Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0962815 | 39.5 | 4 +Matrix assembly for implicit scheme | 0.01431973 | 5.9 | 6 +Convection operator | 0.02844478 | 11.7 | 8 +Diffusion operator | 0.02991948 | 12.3 | 36 +Gradient operator | 0.007994795 | 3.3 | 9 +Divergence operator | 0.002778588 | 1.1 | 8 +Source terms | 0.0007949644 | 0.3 | 6 +Update ::mettre_a_jour | 0.009183308 | 3.8 | 5 +Solver for implicit diffusion | 0.0204651 | 8.4 | 6 +Computation of the time step dt | 0.002499168 | 1.0 | 10 +Turbulence model::update | 0.001439575 | 0.6 | 2 +Post-treatment operations | 0.01804008 | 7.4 | 2 +Other operations | 0.01185865 | 4.9 | + +Average number of iteration of the linear solver per call: 36.1 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0958068 | 39.3 | 4 | +Kernels: | 0.117053 | 48.0 | 1991 | +Copy host to device: | 0.000836579 | 0.3 | 43 | 3.5 GB/s +Copy device to host: | 0.000433294 | 0.2 | 7 | 85.9 GB/s +Alloc/Free on device: | 0.000928713 | 0.4 | 23 | +GPU: 87% Copy H<->D: 0.52% Alloc/free: 0.38% Comm: 0% CPU & I/O: 12% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 4.63836 + +Total time for the whole computation 56.571 + diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.eureka_cc89 
b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..91f11500bd --- /dev/null +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.eureka_cc89 @@ -0,0 +1,79 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the DomainFlowLES_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:30:16 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 3276800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 53.5087 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 1.18754 +Average number of iteration of the linear solver per call: 21.75 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 4.49963 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.499959 +Standard deviation between time steps: 0.0428608 +Time elapsed in the skipped time steps: 3.00568 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.16093 | 32.2 | 4 +Matrix assembly for implicit scheme | 0.01970638 | 3.9 | 6 +Convection operator | 0.09048199 | 18.1 | 8 +Diffusion operator | 0.0725157 | 14.5 | 36 +Gradient operator | 0.02036548 | 4.1 | 9 +Divergence operator | 0.008354662 | 1.7 | 8 +Source terms | 0.001649874 | 0.3 | 6 +Update ::mettre_a_jour | 0.01730522 | 3.5 | 5 +Solver for implicit diffusion | 0.05971053 | 11.9 | 6 +Computation of the time step dt | 0.007172377 | 1.4 | 10 +Turbulence model::update | 0.004708388 | 0.9 | 2 +Post-treatment operations | 0.01345581 | 2.7 | 2 +Other operations | 0.02360189 | 4.7 | + +Average number of iteration of the linear solver per call: 35.4 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.160556 | 32.1 | 4 | +Kernels: | 0.312891 | 62.6 | 1991 | +Copy host to device: | 0.000655779 | 0.1 | 43 | 4.4 GB/s +Copy device to host: | 0.00319411 | 0.6 | 7 | 11.7 GB/s +Alloc/Free on device: | 0.000571484 | 0.1 | 23 | +GPU: 95% Copy H<->D: 0.77% Alloc/free: 0.11% 
Comm: 0% CPU & I/O: 4.4% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0734133 + +Total time for the whole computation 61.0875 + diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.irene-amd-ccrt_cc70 index 1860138c54..f0a90ec939 100644 --- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.irene-amd-ccrt_cc70 +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.irene-amd-ccrt_cc70 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 14:11:03 -OS: irene7056__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 23-04-2026 -- 14:46:40 +OS: irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz Total number of threads:80 GPU model: Tesla V100-SXM2-16GB @@ -22,39 +22,39 @@ Total number of elements used for the calculation: 3276800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 84.3428 +Total time of the start-up: 89.5421 Number of calls to the linear solver per time step: 4 -Average time of the resolution of the linear problem per call: 1.57906 +Average time of the resolution of the linear problem per 
call: 1.65017 Average number of iteration of the linear solver per call: 21.75 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 10.5634 +Total time of the time loop: 10.4615 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.17371 -Standard deviation between time steps: 0.10204 -Time elapsed in the skipped time steps: 4.49973 +Average time per time step: 1.16239 +Standard deviation between time steps: 0.107809 +Time elapsed in the skipped time steps: 4.67111 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.281414 | 24.0 | 4 -Matrix assembly for implicit scheme | 0.04662882 | 4.0 | 6 -Convection operator | 0.3456106 | 29.4 | 8 -Diffusion operator | 0.1786968 | 15.2 | 36 -Gradient operator | 0.0569327 | 4.9 | 9 -Divergence operator | 0.04346049 | 3.7 | 8 -Source terms | 0.00252737 | 0.2 | 6 -Update ::mettre_a_jour | 0.04792681 | 4.1 | 5 -Solver for implicit diffusion | 0.07688781 | 6.6 | 6 -Computation of the time step dt | 0.02732393 | 2.3 | 10 -Turbulence model::update | 0.01237735 | 1.1 | 2 -Post-treatment operations | 0.03411321 | 2.9 | 2 -Other operations | 0.0198106 | 1.7 | +Linear solver resolutions Ax=B | 0.281586 | 24.2 | 4 +Matrix assembly for implicit scheme | 0.04644436 | 4.0 | 6 +Convection operator | 0.3458352 | 29.8 | 8 +Diffusion operator | 0.1801068 | 15.5 | 36 +Gradient operator | 0.04069576 | 3.5 | 9 +Divergence operator | 0.04349869 | 3.7 | 8 +Source terms | 0.002216718 | 0.2 | 6 +Update ::mettre_a_jour | 0.0466742 | 4.0 | 5 +Solver for implicit diffusion | 
0.07746529 | 6.7 | 6 +Computation of the time step dt | 0.02720833 | 2.3 | 10 +Turbulence model::update | 0.01275307 | 1.1 | 2 +Post-treatment operations | 0.03633502 | 3.1 | 2 +Other operations | 0.02157274 | 1.9 | Average number of iteration of the linear solver per call: 35.4 @@ -64,17 +64,17 @@ Average number of iteration of the linear solver per call: 35.4 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.280557 | 23.9 | 4 | -Kernels: | 0.840487 | 71.6 | 1993 | -Copy host to device: | 0.00142747 | 0.1 | 43 | 2.0 GB/s -Copy device to host: | 0.00819653 | 0.7 | 7 | 4.5 GB/s -Alloc/Free on device: | 0.000885623 | 0.1 | 23 | -GPU: 96% Copy H<->D: 0.82% Alloc/free: 0.075% Comm: 0% CPU & I/O: 3.6% +Libraries: | 0.280644 | 24.1 | 4 | +Kernels: | 0.824854 | 71.0 | 1991 | +Copy host to device: | 0.00150175 | 0.1 | 43 | 1.9 GB/s +Copy device to host: | 0.00918955 | 0.8 | 7 | 4.1 GB/s +Alloc/Free on device: | 0.000722558 | 0.1 | 23 | +GPU: 95% Copy H<->D: 0.92% Alloc/free: 0.062% Comm: 0% CPU & I/O: 3.9% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.384397 +Time of the post-resolution: 0.253759 -Total time for the whole computation 99.7903 +Total time for the whole computation 104.929 -[Slurm] Power consumption (116 s): 0.222 kW 0.007 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (119 s): 0.195 kW 0.006 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is157091_cc86 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is157091_cc86 
index 83cb9631bc..647ab137d1 100644 --- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is157091_cc86 +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is157091_cc86 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 10-03-2026 -- 08:38:40 +Date: 14-05-2026 -- 15:57:08 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 GPU model: NVIDIA RTX A6000 CUDA runtime version: 12.90 -CUDA drivers version: 12.70 +CUDA drivers version: 13.20 Nb procs used for the computation: 1 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 3276800 @@ -22,39 +22,39 @@ Total number of elements used for the calculation: 3276800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 55.0303 +Total time of the start-up: 56.4516 Number of calls to the linear solver per time step: 4 -Average time of the resolution of the linear problem per call: 0.852492 +Average time of the resolution of the linear problem per call: 1.05844 Average number of iteration of the linear solver per call: 21.75 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time 
loop: 9.17717 +Total time of the time loop: 8.35729 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.01969 -Standard deviation between time steps: 0.0706814 -Time elapsed in the skipped time steps: 2.88004 +Average time per time step: 0.928588 +Standard deviation between time steps: 0.0665702 +Time elapsed in the skipped time steps: 2.86698 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.278561 | 27.3 | 4 -Matrix assembly for implicit scheme | 0.03953483 | 3.9 | 6 -Convection operator | 0.2709577 | 26.6 | 8 -Diffusion operator | 0.1310476 | 12.9 | 36 -Gradient operator | 0.04856839 | 4.8 | 9 -Divergence operator | 0.03368633 | 3.3 | 8 -Source terms | 0.002160707 | 0.2 | 6 -Update ::mettre_a_jour | 0.0415759 | 4.1 | 5 -Solver for implicit diffusion | 0.08429423 | 8.3 | 6 -Computation of the time step dt | 0.02971084 | 2.9 | 10 -Turbulence model::update | 0.00944341 | 0.9 | 2 -Post-treatment operations | 0.02223871 | 2.2 | 2 -Other operations | 0.02790565 | 2.7 | +Linear solver resolutions Ax=B | 0.265963 | 28.6 | 4 +Matrix assembly for implicit scheme | 0.04038793 | 4.3 | 6 +Convection operator | 0.2593428 | 27.9 | 8 +Diffusion operator | 0.1087147 | 11.7 | 36 +Gradient operator | 0.02225161 | 2.4 | 9 +Divergence operator | 0.02232687 | 2.4 | 8 +Source terms | 0.002280371 | 0.2 | 6 +Update ::mettre_a_jour | 0.03501159 | 3.8 | 5 +Solver for implicit diffusion | 0.08865022 | 9.5 | 6 +Computation of the time step dt | 0.02310452 | 2.5 | 10 +Turbulence model::update | 0.009679251 | 1.0 | 2 +Post-treatment operations | 0.02006731 | 2.2 | 2 +Other operations | 0.03080791 | 3.3 | Average number of iteration of the linear solver per call: 35.4 @@ -64,16 +64,16 @@ Average number of iteration of the linear solver per call: 35.4 
----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.275962 | 27.1 | 4 | -Kernels: | 0.709639 | 69.6 | 1993 | -Copy host to device: | 0.000689055 | 0.1 | 43 | 4.2 GB/s -Copy device to host: | 0.00311297 | 0.3 | 7 | 12.0 GB/s -Alloc/Free on device: | 0.000785802 | 0.1 | 23 | -GPU: 97% Copy H<->D: 0.37% Alloc/free: 0.077% Comm: 0% CPU & I/O: 2.9% +Libraries: | 0.265066 | 28.5 | 4 | +Kernels: | 0.63096 | 67.9 | 1991 | +Copy host to device: | 0.000694008 | 0.1 | 43 | 4.2 GB/s +Copy device to host: | 0.00416774 | 0.4 | 7 | 8.9 GB/s +Alloc/Free on device: | 0.000657239 | 0.1 | 23 | +GPU: 96% Copy H<->D: 0.52% Alloc/free: 0.071% Comm: 0% CPU & I/O: 2.9% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0926276 +Time of the post-resolution: 0.0958504 -Total time for the whole computation 67.1801 +Total time for the whole computation 67.7717 diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is159479_cc120 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..6f4dfb3cd4 --- /dev/null +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is159479_cc120 @@ -0,0 +1,79 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the DomainFlowLES_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:33:44 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 3276800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 40.8316 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 0.59519 +Average number of iteration of the linear solver per call: 21.75 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.72965 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.303294 +Standard deviation between time steps: 0.030574 +Time elapsed in 
the skipped time steps: 1.69938 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.119445 | 39.4 | 4 +Matrix assembly for implicit scheme | 0.0119855 | 4.0 | 6 +Convection operator | 0.04748959 | 15.7 | 8 +Diffusion operator | 0.04161902 | 13.7 | 36 +Gradient operator | 0.01082988 | 3.6 | 9 +Divergence operator | 0.00443817 | 1.5 | 8 +Source terms | 0.0008179478 | 0.3 | 6 +Update ::mettre_a_jour | 0.01174738 | 3.9 | 5 +Solver for implicit diffusion | 0.02742291 | 9.0 | 6 +Computation of the time step dt | 0.003806219 | 1.3 | 10 +Turbulence model::update | 0.004209919 | 1.4 | 2 +Post-treatment operations | 0.009942449 | 3.3 | 2 +Other operations | 0.00953959 | 3.1 | + +Average number of iteration of the linear solver per call: 35.3 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.119216 | 39.3 | 4 | +Kernels: | 0.165121 | 54.4 | 1991 | +Copy host to device: | 0.000505459 | 0.2 | 43 | 5.8 GB/s +Copy device to host: | 0.00358184 | 1.2 | 7 | 10.4 GB/s +Alloc/Free on device: | 0.000322484 | 0.1 | 23 | +GPU: 94% Copy H<->D: 1.3% Alloc/free: 0.11% Comm: 0% CPU & I/O: 4.8% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0774195 + +Total time for the whole computation 45.3381 + diff --git 
a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is247793_gfx1100 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is247793_gfx1100 index 471ab9ec4c..a6a3ca6c4a 100644 --- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is247793_gfx1100 +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is247793_gfx1100 @@ -1,54 +1,79 @@ -Statistiques d'initialisation du calcul - -Temps total 77.5112 - -Statistiques de resolution du probleme - -Temps total 15.4304 - - -Timesteps 10 -Secondes / pas de temps 1.54304 -Dont solveurs Ax=B 0.299995 19% (4 appels/pas de temps) -Dont solveur diffusion_implicite 0.103985 6% (6 appels/pas de temps) -Dont assemblage matrice_implicite 0.122103 7% (6 appels/pas de temps) -Dont mettre_a_jour 0.050672 3% (5 appels/pas de temps) -Dont operateurs convection 0.364616 23% (8 appels/pas de temps) -Dont operateurs diffusion 0.245065 15% (35.9 appels/pas de temps) -Dont operateurs gradient 0.059774 3% (9 appels/pas de temps) -Dont operateurs divergence 0.022762 1% (8.2 appels/pas de temps) -Dont operateurs source 0.006063 0% (6 appels/pas de temps) -Dont operations postraitement 0.202759 13% (2 appels/pas de temps) -Dont calcul dt 0.016310 1% (10 appels/pas de temps) -Dont modele turbulence 0.011045 0% (2 appels/pas de temps) -Dont calcul divers 0.037886 2% (0 appels/pas de temps) -Nb solveur / pas de temps 4 -Secondes / solveur 0.0749988 -Iterations / solveur 27.25 -GPU statistics per time step (experimental): -Libraries : 0.299279 s 19.4% 4.0 calls -Kernels : 0.914389 s 59.3% 64035.4 calls -Copy H2D : 0.034075 s 2.2% 86.4 calls 11.8 GB/s -Copy D2H : 0.012526 s 0.8% 101.0 calls 18.3 GB/s -Alloc/Free: 0.007877 s 0.5% 57.5 calls -GPU: 78.6% Copy H<->D: 3% Alloc/Free: 0.5% Comm: 0% CPU & Others: 17.8% -I/O: - -Timesteps = number of time steps -Nb solveur = number of linear system resolutions -Nb assemblage implicite = number of matrix assemblies for the implicit scheme -Iterations = average number of iterations of the solver -Communications = fraction 
of the time spent - in communications between processors (excluding io files) -Network latency = time of one mpsum measured by an internal bench over 0.1s -Network bandwidth = maximum on all processors - of the average bandwidth of send_recv operations -Waiting time = estimation of the waiting time of the different processors - -Max_waiting_time big => probably due to a bad partitioning -Communications > 30% => too many processors or network too slow - -Statistiques de post resolution - -Temps total 0.422806 + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the DomainFlowLES_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 17:32:53 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 3276800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 48.1642 + +Number of calls to the linear solver per time step: 4 +Average time of the resolution of the linear problem per call: 1.30368 +Average 
number of iteration of the linear solver per call: 41.25 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 9.09926 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 1.01103 +Standard deviation between time steps: 0.0525723 +Time elapsed in the skipped time steps: 2.64799 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.411071 | 40.7 | 4 +Matrix assembly for implicit scheme | 0.05677431 | 5.6 | 6 +Convection operator | 0.17899 | 17.7 | 8 +Diffusion operator | 0.1028158 | 10.2 | 36 +Gradient operator | 0.03740054 | 3.7 | 9 +Divergence operator | 0.01530846 | 1.5 | 8 +Source terms | 0.002693547 | 0.3 | 6 +Update ::mettre_a_jour | 0.03649897 | 3.6 | 5 +Solver for implicit diffusion | 0.0935207 | 9.3 | 6 +Computation of the time step dt | 0.01495112 | 1.5 | 10 +Turbulence model::update | 0.005950598 | 0.6 | 2 +Post-treatment operations | 0.01349221 | 1.3 | 2 +Other operations | 0.04156191 | 4.1 | + +Average number of iteration of the linear solver per call: 40.1 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.410401 | 40.6 | 4 | +Kernels: | 0.57806 | 57.2 | 1991 | +Copy host to 
device: | 0.00114789 | 0.1 | 43 | 2.5 GB/s +Copy device to host: | 0.00192947 | 0.2 | 7 | 19.3 GB/s +Alloc/Free on device: | 0.000596136 | 0.1 | 23 | +GPU: 98% Copy H<->D: 0.3% Alloc/free: 0.059% Comm: 0% CPU & I/O: 1.9% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.154435 + +Total time for the whole computation 60.0659 diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.jean-zay_cc90 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.jean-zay_cc90 index 8c7e324927..929ea656a8 100644 --- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.jean-zay_cc90 +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.jean-zay_cc90 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 13:59:18 -OS: jzxh136__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +Date: 10-06-2026 -- 10:35:02 +OS: jzxh022__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026 CPU model : Intel(R) Xeon(R) Platinum 8468 Total number of threads:192 GPU model: NVIDIA H100 80GB HBM3 CUDA runtime version: 12.60 -CUDA drivers version: 13.0 +CUDA drivers version: 13.20 Nb procs used for the computation: 1 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 3276800 @@ -22,39 +22,39 @@ Total number of elements used for the calculation: 3276800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation 
start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 57.7416 +Total time of the start-up: 41.3232 Number of calls to the linear solver per time step: 4 -Average time of the resolution of the linear problem per call: 1.02785 +Average time of the resolution of the linear problem per call: 0.791435 Average number of iteration of the linear solver per call: 21.75 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 3.39787 +Total time of the time loop: 3.20102 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.377541 -Standard deviation between time steps: 0.0880077 -Time elapsed in the skipped time steps: 2.99383 +Average time per time step: 0.355669 +Standard deviation between time steps: 0.0925432 +Time elapsed in the skipped time steps: 2.7195 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.11934 | 31.6 | 4 -Matrix assembly for implicit scheme | 0.01312974 | 3.5 | 6 -Convection operator | 0.08059334 | 21.3 | 8 -Diffusion operator | 0.04303252 | 11.4 | 36 -Gradient operator | 0.0157995 | 4.2 | 9 -Divergence operator | 0.007773447 | 2.1 | 8 -Source terms | 0.0009952992 | 0.3 | 6 -Update ::mettre_a_jour | 0.01492743 | 4.0 | 5 -Solver for implicit diffusion | 0.02761908 | 7.3 | 6 -Computation of the time step dt | 0.004369273 | 1.2 | 10 -Turbulence model::update | 0.00482918 | 1.3 | 2 -Post-treatment operations | 0.03422401 | 9.1 | 2 -Other operations | 0.01090807 | 2.9 
| +Linear solver resolutions Ax=B | 0.118033 | 33.2 | 4 +Matrix assembly for implicit scheme | 0.01421842 | 4.0 | 6 +Convection operator | 0.06594422 | 18.5 | 8 +Diffusion operator | 0.03895065 | 11.0 | 36 +Gradient operator | 0.01349732 | 3.8 | 9 +Divergence operator | 0.003659466 | 1.0 | 8 +Source terms | 0.001029953 | 0.3 | 6 +Update ::mettre_a_jour | 0.01316835 | 3.7 | 5 +Solver for implicit diffusion | 0.03054857 | 8.6 | 6 +Computation of the time step dt | 0.003698007 | 1.0 | 10 +Turbulence model::update | 0.004816221 | 1.4 | 2 +Post-treatment operations | 0.03526975 | 9.9 | 2 +Other operations | 0.0128344 | 3.6 | Average number of iteration of the linear solver per call: 35.4 @@ -64,17 +64,17 @@ Average number of iteration of the linear solver per call: 35.4 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.118884 | 31.5 | 4 | -Kernels: | 0.205998 | 54.6 | 1993 | -Copy host to device: | 0.000907065 | 0.2 | 43 | 3.2 GB/s -Copy device to host: | 0.00452331 | 1.2 | 7 | 8.2 GB/s -Alloc/Free on device: | 0.000677801 | 0.2 | 23 | -GPU: 86% Copy H<->D: 1.4% Alloc/free: 0.18% Comm: 0% CPU & I/O: 12% +Libraries: | 0.117593 | 33.1 | 4 | +Kernels: | 0.185695 | 52.2 | 1991 | +Copy host to device: | 0.000912839 | 0.3 | 43 | 3.2 GB/s +Copy device to host: | 0.00452191 | 1.3 | 7 | 8.2 GB/s +Alloc/Free on device: | 0.000529603 | 0.1 | 23 | +GPU: 85% Copy H<->D: 1.5% Alloc/free: 0.15% Comm: 0% CPU & I/O: 13% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.28617 +Time of the 
post-resolution: 0.251046 -Total time for the whole computation 64.4195 +Total time for the whole computation 47.4947 -[Slurm] Power consumption (79 s): 0.443 kW 0.010 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (56 s): 0.441 kW 0.007 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.lumi_gfx90a b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.lumi_gfx90a index e2bf102ad2..65f7f6e603 100644 --- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.lumi_gfx90a +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.lumi_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 23-02-2026 -- 23:56:09 -OS: nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 18-05-2026 -- 08:46:58 +OS: nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,41 +22,41 @@ Total number of elements used for the calculation: 3276800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 102.82 +Total time of the start-up: 116.834 Number of calls to the linear solver per time step: 4 -Average time of the resolution of the linear problem per call: 1.87468 -Average number of iteration of the linear solver per call: 19.5 +Average time of the resolution of the linear problem per call: 1.98809 +Average number of iteration of the linear 
solver per call: 21 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 12.0037 +Total time of the time loop: 9.37967 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.33374 -Standard deviation between time steps: 0.0778389 -Time elapsed in the skipped time steps: 3.78639 +Average time per time step: 1.04219 +Standard deviation between time steps: 0.0675697 +Time elapsed in the skipped time steps: 3.91614 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.20903 | 11.9 | 4 -Matrix assembly for implicit scheme | 0.1245094 | 7.1 | 6 -Convection operator | 0.4432143 | 25.3 | 8 -Diffusion operator | 0.1587154 | 9.0 | 36 -Gradient operator | 0.1311887 | 7.5 | 9 -Divergence operator | 0.03958435 | 2.3 | 8 -Source terms | 0.001954178 | 0.1 | 6 -Update ::mettre_a_jour | 0.1000263 | 5.7 | 5 -Solver for implicit diffusion | 0.05686906 | 3.2 | 6 -Computation of the time step dt | 0.02558297 | 1.5 | 10 -Turbulence model::update | 0.01002186 | 0.6 | 2 -Post-treatment operations | 0.02588545 | 1.5 | 2 -Other operations | 0.00715868 | 0.4 | +Linear solver resolutions Ax=B | 0.159419 | 15.3 | 4 +Matrix assembly for implicit scheme | 0.09955579 | 9.6 | 6 +Convection operator | 0.420294 | 40.3 | 8 +Diffusion operator | 0.1025904 | 9.8 | 36 +Gradient operator | 0.06268515 | 6.0 | 9 +Divergence operator | 0.02317575 | 2.2 | 8 +Source terms | 0.001829052 | 0.2 | 6 +Update ::mettre_a_jour | 0.04758081 | 4.6 | 5 +Solver for implicit diffusion | 0.05482205 | 5.3 | 6 +Computation of the time step dt | 
0.01917573 | 1.8 | 10 +Turbulence model::update | 0.00881079 | 0.8 | 2 +Post-treatment operations | 0.02308013 | 2.2 | 2 +Other operations | 0.01916755 | 1.8 | -Average number of iteration of the linear solver per call: 34.5 +Average number of iteration of the linear solver per call: 35.3 ----------------------------------------------------------------------------------------------------------- @@ -64,17 +64,17 @@ Average number of iteration of the linear solver per call: 34.5 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.206993 | 15.5 | 4 | -Kernels: | 1.09133 | 81.8 | 1993 | -Copy host to device: | 0.000990319 | 0.1 | 43 | 2.9 GB/s -Copy device to host: | 0.00182308 | 0.1 | 7 | 20.4 GB/s -Alloc/Free on device: | 0.00034577 | 0.0 | 23 | -GPU: 97% Copy H<->D: 0.21% Alloc/free: 0.026% Comm: 0% CPU & I/O: 2.4% +Libraries: | 0.158676 | 15.2 | 4 | +Kernels: | 0.847624 | 81.3 | 1991 | +Copy host to device: | 0.00099656 | 0.1 | 43 | 2.9 GB/s +Copy device to host: | 0.00182369 | 0.2 | 7 | 20.4 GB/s +Alloc/Free on device: | 0.00034191 | 0.0 | 23 | +GPU: 97% Copy H<->D: 0.27% Alloc/free: 0.033% Comm: 0% CPU & I/O: 3.1% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.577526 +Time of the post-resolution: 0.696635 -Total time for the whole computation 119.187 +Total time for the whole computation 130.827 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (155 s): 0.503 kW 0.022 kWh 0.002 € (0.10€/kWh) diff --git 
a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.topaze_cc80 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.topaze_cc80 index f863c3cda3..5b32389127 100644 --- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.topaze_cc80 +++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.topaze_cc80 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 17:41:21 -OS: topaze7059__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 15-05-2026 -- 13:28:03 +OS: topaze7064__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,39 +22,39 @@ Total number of elements used for the calculation: 3276800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 75.2339 +Total time of the start-up: 75.4274 Number of calls to the linear solver per time step: 4 -Average time of the resolution of the linear problem per call: 1.19536 +Average time of the resolution of the linear problem per call: 1.22135 Average number of iteration of the linear solver per call: 21.75 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time 
of the time loop: 4.85899 +Total time of the time loop: 4.19424 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.539888 -Standard deviation between time steps: 0.0744115 -Time elapsed in the skipped time steps: 3.20339 +Average time per time step: 0.466026 +Standard deviation between time steps: 0.0711581 +Time elapsed in the skipped time steps: 3.00923 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.163403 | 18.2 | 4 -Matrix assembly for implicit scheme | 0.02544632 | 2.8 | 6 -Convection operator | 0.1210334 | 13.5 | 8 -Diffusion operator | 0.06967004 | 7.8 | 36 -Gradient operator | 0.02544889 | 2.8 | 9 -Divergence operator | 0.0106954 | 1.2 | 8 -Source terms | 0.002139126 | 0.2 | 6 -Update ::mettre_a_jour | 0.01935749 | 2.2 | 5 -Solver for implicit diffusion | 0.0451121 | 5.0 | 6 -Computation of the time step dt | 0.009243173 | 1.0 | 10 -Turbulence model::update | 0.004829846 | 0.5 | 2 -Post-treatment operations | 0.02625407 | 2.9 | 2 -Other operations | 0.01725465 | 1.9 | +Linear solver resolutions Ax=B | 0.160839 | 34.5 | 4 +Matrix assembly for implicit scheme | 0.02282103 | 4.9 | 6 +Convection operator | 0.09259933 | 19.9 | 8 +Diffusion operator | 0.05369537 | 11.5 | 36 +Gradient operator | 0.01970627 | 4.2 | 9 +Divergence operator | 0.005499746 | 1.2 | 8 +Source terms | 0.001331374 | 0.3 | 6 +Update ::mettre_a_jour | 0.01598558 | 3.4 | 5 +Solver for implicit diffusion | 0.04173282 | 9.0 | 6 +Computation of the time step dt | 0.00593109 | 1.3 | 10 +Turbulence model::update | 0.003905554 | 0.8 | 2 +Post-treatment operations | 0.02473622 | 5.3 | 2 +Other operations | 0.0172433 | 3.7 | Average number of iteration of the linear solver per call: 35.4 @@ -64,16 +64,17 @@ Average number of iteration of the linear solver per call: 35.4 
----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.162821 | 30.2 | 4 | -Kernels: | 0.334537 | 62.0 | 1993 | -Copy host to device: | 0.000898156 | 0.2 | 43 | 3.2 GB/s -Copy device to host: | 0.00366181 | 0.7 | 7 | 10.2 GB/s -Alloc/Free on device: | 0.000978722 | 0.2 | 23 | -GPU: 92% Copy H<->D: 0.84% Alloc/free: 0.18% Comm: 0% CPU & I/O: 6.9% +Libraries: | 0.16027 | 34.4 | 4 | +Kernels: | 0.265791 | 57.0 | 1991 | +Copy host to device: | 0.000874332 | 0.2 | 43 | 3.3 GB/s +Copy device to host: | 0.00237432 | 0.5 | 7 | 15.7 GB/s +Alloc/Free on device: | 0.000643922 | 0.1 | 23 | +GPU: 91% Copy H<->D: 0.7% Alloc/free: 0.14% Comm: 0% CPU & I/O: 7.7% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.480654 +Time of the post-resolution: 0.277696 -Total time for the whole computation 83.7769 +Total time for the whole computation 82.9086 +[Slurm] Power consumption (114 s): 0.577 kW 0.018 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/DomainFlowLES/check_perf.sh b/tests/GPU/DomainFlowLES/check_perf.sh index 0c9c696391..4972132040 100755 --- a/tests/GPU/DomainFlowLES/check_perf.sh +++ b/tests/GPU/DomainFlowLES/check_perf.sh @@ -56,6 +56,7 @@ run() # Try to mitigate variablity by setting exclusive mode on GPU (firefox, slack, edge, chrome, use device !) 
[ "$np" = "" ] && [ "$TRUST_WITHOUT_HOST" = 1 ] && [ "`hostname`" = is157091 ] && set_EXCLUSIVE_PROCESS=`sudo ls 2>/dev/null` [ "$set_EXCLUSIVE_PROCESS" != "" ] && sudo nvidia-smi -c EXCLUSIVE_PROCESS 1>/dev/null + trust -clean 1>/dev/null 2>&1 # Clean the files for IO trust $nsys $jdd $np 1>$jdd.out_err 2>&1 [ "$set_EXCLUSIVE_PROCESS" != "" ] && sudo nvidia-smi -c DEFAULT 1>/dev/null check $jdd $gpu @@ -96,6 +97,7 @@ else then [ "`grep -i 'nb_parts 8' $jdd.data`" != "" ] && run $HOST$GPU_ARCH 8 fi + #[ "`grep 'PARALLEL OK' $jdd.data`" != "" ] && run $HOST$GPU_ARCH 2 fi # clean rm -f *.sauv *.lml *.sqlite *.nsys-rep diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.data b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.data new file mode 100644 index 0000000000..bf5eb4afa5 --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.data @@ -0,0 +1,141 @@ +# Warning: Degraded data file of GAMELAN simulation. Do not use except for non regression testing ! # +# PARALLEL OK # + +dimension 3 + +Pb_Thermohydraulique_QC pb1 + +Domaine dom + +# BEGIN MESH # +lire_fichier dom dom.geom +/* raffiner_isotrope dom raffiner_isotrope dom */ +# END MESH # +# BEGIN PARTITION +Partition dom +{ + Partition_tool metis { Nb_parts 2 } + Larg_joint 2 + zones_name dom + ecrire_lata dom.lata + reorder 1 + single_hdf +} +End +END PARTITION # +# BEGIN SCATTER +Scatter dom.Zones dom +END SCATTER # + +VDF dis +Lire dis { reorder { algo hilbert } } + +Runge_Kutta_Rationnel_ordre_2 sch + +Lire sch +{ + nb_pas_dt_max 10 + tinit 0. + dt_start dt_calc + tmax 20. + dt_min 1.e-7 + dt_max 6.e-3 + dt_impr 1.e-7 + + seuil_statio 1.e-14 + facsec 1 + diffusion_implicite 1 + seuil_diffusion_implicite 1.e-10 + tcpumax 23.30 + +} + +Associer pb1 dom +Associer pb1 sch + +option_vdf { + p_imposee_aux_faces oui +} +Discretiser pb1 dis + +Lire pb1 +{ + + fluide_quasi_compressible { + gravite champ_uniforme 3 0 0. -9.81 + + pression 100000. 
+ + mu champ_fonc_fonction pb1 temperature 1 (0.86269e-5*val*0.02897*(8.0^0.5)*((1.0+(0.0020159/0.02897))^0.5))/(((val*0.02897*(8.0^0.5)*((1.0+(0.0020159/0.02897))^0.5))+(1.0-val)*0.0020159*((1.0+(((0.86269/1.792)^0.5)*((0.02897/0.0020159)^0.25)))^2)))+(1.792e-5*(1.0-val)*0.0020159*(8.0^0.5)*((1.0+(0.02897/0.0020159))^0.5))/((((1.0-val)*0.0020159*(8.0^0.5)*((1.0+(0.02897/0.0020159))^0.5))+(val)*0.02897*((1.0+(((1.792/0.86269)^0.5)*((0.0020159/0.02897)^0.25)))^2))) + + lambda champ_fonc_fonction pb1 temperature 1 7.72e-5*(100000.0*0.02897)/(8.314472*284.15*((((0.02897/0.0020159)-1.0)*val)+1.00)) + + loi_etat gaz_parfait_QC { + + Prandtl 0.189301713586576 + Cp 1. + gamma 1.4 + } + + traitement_pth constant + + } + + + + Navier_Stokes_QC + { + + solveur_pression amg gcp { rtol 1.e-7 impr } + convection { centre } + diffusion { } + + conditions_initiales + { + vitesse Champ_uniforme 3 0. 0. 0. + } + conditions_limites + { + w_w_w_w paroi_fixe + c_c_c_c_c_c_c%0 paroi_fixe + p paroi_fixe + in frontiere_ouverte_rho_U_impose Champ_front_uniforme 3 0. 0. 
0.0264513805222214 + o_o_o_o Frontiere_ouverte_pression_imposee Champ_front_fonc_XYZ 1 -12.0291562777197*z + f Frontiere_ouverte_pression_imposee Champ_front_fonc_XYZ 1 -12.0291562777197*z + g Frontiere_ouverte_pression_imposee Champ_front_fonc_XYZ 1 -12.0291562777197*z + } + + } + + Convection_diffusion_Chaleur_QC + { + convection { quick } + diffusion { } + conditions_initiales + { + temperature Champ_Fonc_XYZ dom 1 273+10*(x+y+z) + } + + conditions_limites + { + w_w_w_w paroi_adiabatique + c_c_c_c_c_c_c%0 paroi_adiabatique + p paroi_adiabatique + in frontiere_ouverte_temperature_imposee Champ_front_Uniforme 1 273 + o_o_o_o frontiere_ouverte T_ext Champ_front_Uniforme 1 273 + f frontiere_ouverte T_ext Champ_front_Uniforme 1 272 + g frontiere_ouverte T_ext Champ_front_Uniforme 1 272 + } + } + Postraitement + { + Champs dt_post 1000 + { + pression elem + } + } +} +Resoudre pb1 +Fin + + diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.lml.gz b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.lml.gz new file mode 120000 index 0000000000..0a5fc3c894 --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.lml.gz @@ -0,0 +1 @@ +../../Dilatable/GAMELAN/GAMELAN.lml.gz \ No newline at end of file diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx90a b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx90a new file mode 100644 index 0000000000..6cdfd9503a --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx90a @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 23-04-2026 -- 09:20:04 +OS: g1109__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD EPYC 7A53 64-Core Processor +Total number of threads:128 +GPU model: AMD Instinct MI250X +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 10.3899 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.443242 +Average number of iteration of the linear solver per call: 19 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.21534 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.246149 +Standard deviation between time steps: 0.072186 +Time elapsed in the skipped time steps: 0.545262 + + +Standard counter 
description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0277442 | 11.3 | 2 +Convection operator | 0.01435192 | 5.8 | 7 +Diffusion operator | 0.01873119 | 7.6 | 16 +Gradient operator | 0.004311924 | 1.8 | 5 +Divergence operator | 0.001880982 | 0.8 | 6 +Source terms | 0.000381754 | 0.2 | 4 +Update ::mettre_a_jour | 0.01100964 | 4.5 | 4 +Solver for implicit diffusion | 0.004626703 | 1.9 | 4 +Computation of the time step dt | 0.00107997 | 0.4 | 6 +Post-treatment operations | 0.1561464 | 63.4 | 1 +Other operations | 0.005884409 | 2.4 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0275371 | 11.2 | 2 | +Kernels: | 0.172778 | 70.2 | 1073 | +Copy host to device: | 0.00248311 | 1.0 | 107 | 5.3 GB/s +Copy device to host: | 0.00041268 | 0.2 | 6 | 11.6 GB/s +Alloc/Free on device: | 2.92352e-05 | 0.0 | 856 | +GPU: 81% Copy H<->D: 1.2% Alloc/free: 0.012% Comm: 0% CPU & I/O: 17% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0288232 + +Total time for the whole computation 13.1794 + +[Slurm] Power consumption (21 s): 0.373 kW 0.002 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx942 
b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx942 new file mode 100644 index 0000000000..79f6321bed --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx942 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 23-04-2026 -- 14:40:15 +OS: a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD Instinct MI300A Accelerator +Total number of threads:192 +GPU model: AMD Instinct MI300A +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 10.7076 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.491718 +Average number of iteration of the linear solver per call: 19 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The 
first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.764317 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0849242 +Standard deviation between time steps: 0.0536174 +Time elapsed in the skipped time steps: 0.289352 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0168719 | 19.9 | 2 +Convection operator | 0.01172727 | 13.8 | 7 +Diffusion operator | 0.01047739 | 12.3 | 16 +Gradient operator | 0.004039262 | 4.8 | 5 +Divergence operator | 0.001213214 | 1.4 | 6 +Source terms | 0.0002689149 | 0.3 | 4 +Update ::mettre_a_jour | 0.006867849 | 8.1 | 4 +Solver for implicit diffusion | 0.004425528 | 5.2 | 4 +Computation of the time step dt | 0.0008362072 | 1.0 | 6 +Post-treatment operations | 0.02350376 | 27.7 | 1 +Other operations | 0.004692891 | 5.5 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0166839 | 19.6 | 2 | +Kernels: | 0.0298892 | 35.2 | 1073 | +Copy host to device: | 0.00228099 | 2.7 | 107 | 5.8 GB/s +Copy device to host: | 0.000326292 | 0.4 | 6 | 14.7 GB/s +Alloc/Free on device: | 2.7959e-05 | 0.0 | 856 | +GPU: 55% Copy H<->D: 3.1% Alloc/free: 0.033% Comm: 0% CPU & I/O: 42% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0255038 + +Total time for the whole computation 11.7868 + +[Slurm] Power consumption (21 s): 0.485 kW 0.003 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.dalianvl_cc100 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..5c43126385 --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:14:24 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 5.34843 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 
0.303297 +Average number of iteration of the linear solver per call: 19 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.662786 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0736429 +Standard deviation between time steps: 0.0515967 +Time elapsed in the skipped time steps: 0.246865 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0194675 | 26.4 | 2 +Convection operator | 0.008774242 | 11.9 | 7 +Diffusion operator | 0.008319704 | 11.3 | 16 +Gradient operator | 0.00259036 | 3.5 | 5 +Divergence operator | 0.0009964828 | 1.4 | 6 +Source terms | 0.0001999642 | 0.3 | 4 +Update ::mettre_a_jour | 0.003929349 | 5.3 | 4 +Solver for implicit diffusion | 0.002681795 | 3.6 | 4 +Computation of the time step dt | 0.0006292012 | 0.9 | 6 +Post-treatment operations | 0.02159892 | 29.3 | 1 +Other operations | 0.004455399 | 6.1 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0192347 | 26.1 | 2 | +Kernels: | 0.0204398 | 27.8 | 1116 | +Copy host to device: | 0.00176518 | 2.4 | 102 | 5.0 GB/s +Copy device to host: | 
0.000111953 | 0.2 | 6 | 4.5 GB/s +Alloc/Free on device: | 3.53492e-05 | 0.0 | 856 | +GPU: 54% Copy H<->D: 2.5% Alloc/free: 0.048% Comm: 0% CPU & I/O: 44% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0184735 + +Total time for the whole computation 6.27657 + diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.eureka_cc89 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..9b15d03726 --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.eureka_cc89 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:30:32 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 6.16705 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.365755 +Average number of iteration of the linear solver per call: 19 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.883002 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0981114 +Standard deviation between time steps: 0.0399791 +Time elapsed in the skipped time steps: 0.283736 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0294439 | 30.0 | 2 +Convection operator | 0.01347316 | 13.7 | 7 +Diffusion operator | 0.0176114 | 18.0 | 16 +Gradient operator | 0.003485975 | 3.6 | 5 +Divergence operator | 0.006129298 | 6.2 | 6 +Source terms | 0.0004662337 | 0.5 | 4 +Update ::mettre_a_jour | 0.005655361 | 5.8 | 4 +Solver for implicit diffusion | 0.002242366 | 2.3 | 4 +Computation of the time step dt | 0.0007874331 | 0.8 | 6 +Post-treatment operations | 0.01724572 | 17.6 | 1 +Other operations | 0.001570476 | 1.6 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0293034 | 29.9 | 2 | +Kernels: | 0.023372 | 23.8 | 1099 | +Copy host to device: | 0.00983065 | 10.0 | 120 | 9.1 GB/s +Copy device to host: | 0.00637157 | 6.5 | 24 | 12.7 GB/s +Alloc/Free on device: | 2.12862e-05 | 0.0 | 856 | +GPU: 54% Copy H<->D: 17% Alloc/free: 0.022% Comm: 0% CPU & I/O: 30% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0112591 + +Total time for the whole computation 7.34506 + diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.irene-amd-ccrt_cc70 new file mode 100644 index 0000000000..f24564b48b --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.irene-amd-ccrt_cc70 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 23-04-2026 -- 09:21:57 +OS: irene7050__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz +Total number of threads:80 +GPU model: Tesla V100-SXM2-16GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 6.53659 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.526084 +Average number of iteration of the linear solver per call: 19 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.25661 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.139624 +Standard deviation between time steps: 0.0814119 +Time elapsed in the skipped time steps: 0.661938 + + +Standard counter 
description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0439441 | 31.5 | 2 +Convection operator | 0.01506447 | 10.8 | 7 +Diffusion operator | 0.01830773 | 13.1 | 16 +Gradient operator | 0.004852523 | 3.5 | 5 +Divergence operator | 0.001835266 | 1.3 | 6 +Source terms | 0.0004075369 | 0.3 | 4 +Update ::mettre_a_jour | 0.007209213 | 5.2 | 4 +Solver for implicit diffusion | 0.005015587 | 3.6 | 4 +Computation of the time step dt | 0.001144518 | 0.8 | 6 +Post-treatment operations | 0.03472955 | 24.9 | 1 +Other operations | 0.007113221 | 5.1 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0436994 | 31.3 | 2 | +Kernels: | 0.0398327 | 28.5 | 1073 | +Copy host to device: | 0.00489413 | 3.5 | 107 | 2.7 GB/s +Copy device to host: | 0.00110194 | 0.8 | 6 | 4.4 GB/s +Alloc/Free on device: | 2.8232e-05 | 0.0 | 856 | +GPU: 60% Copy H<->D: 4.3% Alloc/free: 0.02% Comm: 0% CPU & I/O: 36% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0293262 + +Total time for the whole computation 8.48449 + +[Slurm] Power consumption (23 s): 0.290 kW 0.002 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is157091_cc86 
b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is157091_cc86 new file mode 100644 index 0000000000..098287e410 --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is157091_cc86 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 31-05-2026 -- 19:50:56 +OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 +CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores +Total number of threads:64 +GPU model: NVIDIA RTX A6000 +CUDA runtime version: 12.90 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 5.13875 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.361327 +Average number of iteration of the linear solver per call: 19 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time 
step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.11979 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.124421 +Standard deviation between time steps: 0.0569485 +Time elapsed in the skipped time steps: 0.248369 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0481099 | 38.7 | 2 +Convection operator | 0.01159845 | 9.3 | 7 +Diffusion operator | 0.01443568 | 11.6 | 16 +Gradient operator | 0.003674675 | 3.0 | 5 +Divergence operator | 0.001333877 | 1.1 | 6 +Source terms | 0.0003881394 | 0.3 | 4 +Update ::mettre_a_jour | 0.008112219 | 6.5 | 4 +Solver for implicit diffusion | 0.004905236 | 3.9 | 4 +Computation of the time step dt | 0.001191429 | 1.0 | 6 +Post-treatment operations | 0.02340271 | 18.8 | 1 +Other operations | 0.007268525 | 5.8 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0479003 | 38.5 | 2 | +Kernels: | 0.0420805 | 33.8 | 1116 | +Copy host to device: | 0.00167663 | 1.3 | 102 | 5.3 GB/s +Copy device to host: | 0.000106937 | 0.1 | 6 | 4.7 GB/s +Alloc/Free on device: | 2.33561e-05 | 0.0 | 856 | +GPU: 72% Copy H<->D: 1.4% Alloc/free: 0.019% Comm: 0% CPU & I/O: 26% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.010622 + +Total time for the whole computation 6.51754 + diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is159479_cc120 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..6fe6886c37 --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is159479_cc120 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-05-2026 -- 07:02:42 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May 1 12:45:19 UTC 2026 (6 +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 3.77582 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.224925 
+Average number of iteration of the linear solver per call: 19 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.524563 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0582848 +Standard deviation between time steps: 0.0277033 +Time elapsed in the skipped time steps: 0.209913 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0235559 | 40.4 | 2 +Convection operator | 0.006250034 | 10.7 | 7 +Diffusion operator | 0.006456884 | 11.1 | 16 +Gradient operator | 0.00193865 | 3.3 | 5 +Divergence operator | 0.000565465 | 1.0 | 6 +Source terms | 0.0001516686 | 0.3 | 4 +Update ::mettre_a_jour | 0.002713566 | 4.7 | 4 +Solver for implicit diffusion | 0.001805891 | 3.1 | 4 +Computation of the time step dt | 0.0004955493 | 0.9 | 6 +Post-treatment operations | 0.01174376 | 20.1 | 1 +Other operations | 0.002607339 | 4.5 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.023469 | 40.3 | 2 | +Kernels: | 0.0159351 | 27.3 | 1116 | +Copy host to device: | 0.00123798 | 2.1 | 102 | 7.2 GB/s +Copy device to host: | 8.48317e-05 | 0.1 | 
6 | 5.9 GB/s +Alloc/Free on device: | 1.78693e-05 | 0.0 | 856 | +GPU: 68% Copy H<->D: 2.3% Alloc/free: 0.031% Comm: 0% CPU & I/O: 30% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.00880884 + +Total time for the whole computation 4.51912 + diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is246827_cc86 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is246827_cc86 new file mode 100644 index 0000000000..119f8aacad --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is246827_cc86 @@ -0,0 +1,52 @@ +Statistiques d'initialisation du calcul + +Temps total 4.8905 + +Statistiques de resolution du probleme + +Temps total 3.46361 + + +Timesteps 10 +Secondes / pas de temps 0.346353 +Dont solveurs Ax=B 0.170325 49% (2 appels/pas de temps) +Dont solveur diffusion_implicite 0.012993 3% (4 appels/pas de temps) +Dont mettre_a_jour 0.017205 4% (4 appels/pas de temps) +Dont operateurs convection 0.036781 10% (6.9 appels/pas de temps) +Dont operateurs diffusion 0.045840 13% (16 appels/pas de temps) +Dont operateurs gradient 0.010809 3% (5 appels/pas de temps) +Dont operateurs divergence 0.006897 1% (6 appels/pas de temps) +Dont operateurs source 0.000772 0% (4 appels/pas de temps) +Dont operations postraitement 0.018958 5% (1 appel/pas de temps) +Dont calcul dt 0.002290 0% (6 appels/pas de temps) +Dont calcul divers 0.023481 6% (0 appels/pas de temps) +Nb solveur / pas de temps 2 +Secondes / solveur 0.0851625 +Iterations / solveur 16.05 +GPU statistics per time step (experimental): +Libraries : 0.169797 s 49.0% 2.0 calls +Kernels : 0.106873 s 30.9% 1154.1 calls +Copy H2D : 0.008219 s 2.4% 137.2 calls 8.0 GB/s +Copy D2H : 0.004039 s 1.2% 11.4 calls 8.2 GB/s +Alloc/Free: 0.008932 s 2.6% 912.1 calls +GPU: 79.8% Copy H<->D: 
3.5% Alloc/Free: 2.5% Comm: 0% CPU & Others: 14% +I/O: + +Timesteps = number of time steps +Nb solveur = number of linear system resolutions +Nb assemblage implicite = number of matrix assemblies for the implicit scheme +Iterations = average number of iterations of the solver +Communications = fraction of the time spent + in communications between processors (excluding io files) +Network latency = time of one mpsum measured by an internal bench over 0.1s +Network bandwidth = maximum on all processors + of the average bandwidth of send_recv operations +Waiting time = estimation of the waiting time of the different processors + +Max_waiting_time big => probably due to a bad partitioning +Communications > 30% => too many processors or network too slow + +Statistiques de post resolution + +Temps total 0.143859 + diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is247793_gfx1100 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is247793_gfx1100 new file mode 100644 index 0000000000..bd0ae4f1e6 --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is247793_gfx1100 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 17:33:11 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 4.69876 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.491257 +Average number of iteration of the linear solver per call: 22 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.50817 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.167574 +Standard deviation between time steps: 0.0378413 +Time elapsed in the skipped time steps: 0.324171 + + +Standard 
counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0474527 | 28.3 | 2 +Convection operator | 0.01240694 | 7.4 | 7 +Diffusion operator | 0.01717201 | 10.2 | 16 +Gradient operator | 0.003410908 | 2.0 | 5 +Divergence operator | 0.001815707 | 1.1 | 6 +Source terms | 0.0005138534 | 0.3 | 4 +Update ::mettre_a_jour | 0.007159344 | 4.3 | 4 +Solver for implicit diffusion | 0.005624942 | 3.4 | 4 +Computation of the time step dt | 0.00146865 | 0.9 | 6 +Post-treatment operations | 0.06273935 | 37.4 | 1 +Other operations | 0.00780992 | 4.7 | + +Average number of iteration of the linear solver per call: 21.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0472264 | 28.2 | 2 | +Kernels: | 0.0947303 | 56.5 | 1116 | +Copy host to device: | 0.00191656 | 1.1 | 102 | 4.6 GB/s +Copy device to host: | 0.00020206 | 0.1 | 6 | 2.5 GB/s +Alloc/Free on device: | 1.96927e-05 | 0.0 | 856 | +GPU: 85% Copy H<->D: 1.3% Alloc/free: 0.012% Comm: 0% CPU & I/O: 14% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0125331 + +Total time for the whole computation 6.54364 + diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.jean-zay_cc90 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.jean-zay_cc90 new file mode 100644 index 
0000000000..07ab4d2e5d --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.jean-zay_cc90 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 23-04-2026 -- 08:11:50 +OS: jzxh011__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 6.74938 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.418271 +Average number of iteration of the linear solver per call: 19 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics 
+Total time of the time loop: 0.768038 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0853375 +Standard deviation between time steps: 0.0561452 +Time elapsed in the skipped time steps: 0.663388 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0226408 | 26.5 | 2 +Convection operator | 0.01184246 | 13.9 | 7 +Diffusion operator | 0.009782814 | 11.5 | 16 +Gradient operator | 0.003806978 | 4.5 | 5 +Divergence operator | 0.0009910392 | 1.2 | 6 +Source terms | 0.0001986488 | 0.2 | 4 +Update ::mettre_a_jour | 0.00376011 | 4.4 | 4 +Solver for implicit diffusion | 0.002483634 | 2.9 | 4 +Computation of the time step dt | 0.0006226492 | 0.7 | 6 +Post-treatment operations | 0.02469796 | 28.9 | 1 +Other operations | 0.00451043 | 5.3 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0225083 | 26.4 | 2 | +Kernels: | 0.0191684 | 22.5 | 1073 | +Copy host to device: | 0.00240775 | 2.8 | 107 | 5.5 GB/s +Copy device to host: | 0.00061727 | 0.7 | 6 | 7.8 GB/s +Alloc/Free on device: | 1.9512e-05 | 0.0 | 856 | +GPU: 49% Copy H<->D: 3.5% Alloc/free: 0.023% Comm: 0% CPU & I/O: 48% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the 
post-resolution: 0.0715051 + +Total time for the whole computation 8.25234 + +[Slurm] Power consumption (19 s): 0.370 kW 0.002 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.lumi_gfx90a b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.lumi_gfx90a new file mode 100644 index 0000000000..7b334d89de --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.lumi_gfx90a @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 18-05-2026 -- 08:50:06 +OS: nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +CPU model : AMD EPYC 7A53 64-Core Processor +Total number of threads:128 +GPU model: AMD Instinct MI250X +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 48.31 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.69807 +Average number of iteration of the linear solver per call: 19 + 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.09182 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.232424 +Standard deviation between time steps: 0.0682738 +Time elapsed in the skipped time steps: 0.479321 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0269227 | 11.6 | 2 +Convection operator | 0.01438519 | 6.2 | 7 +Diffusion operator | 0.01992115 | 8.6 | 16 +Gradient operator | 0.004397904 | 1.9 | 5 +Divergence operator | 0.001778478 | 0.8 | 6 +Source terms | 0.0003667301 | 0.2 | 4 +Update ::mettre_a_jour | 0.01043215 | 4.5 | 4 +Solver for implicit diffusion | 0.004551798 | 2.0 | 4 +Computation of the time step dt | 0.001073174 | 0.5 | 6 +Post-treatment operations | 0.1427887 | 61.4 | 1 +Other operations | 0.005806166 | 2.5 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0267111 | 11.5 | 2 | +Kernels: | 0.163472 | 70.3 | 1116 | +Copy host to device: | 0.0021821 | 0.9 | 102 | 4.1 GB/s +Copy device to host: | 0.000161978 | 0.1 | 6 | 3.1 GB/s +Alloc/Free on device: | 3.40422e-05 | 0.0 | 856 | +GPU: 82% 
Copy H<->D: 1% Alloc/free: 0.015% Comm: 0% CPU & I/O: 17% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0255515 + +Total time for the whole computation 50.9067 + +[Slurm] Power consumption (76 s): 0.449 kW 0.009 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.topaze_cc80 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.topaze_cc80 new file mode 100644 index 0000000000..426022d24e --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.topaze_cc80 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 15-05-2026 -- 13:29:10 +OS: topaze7064__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : AMD EPYC 7763 64-Core Processor +Total number of threads:256 +GPU model: NVIDIA A100-SXM4-80GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 600576 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 7.11853 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.439124 +Average number of iteration of the linear solver per call: 19 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.928862 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.103207 +Standard deviation between time steps: 0.0773559 +Time elapsed in the skipped time steps: 0.454931 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0303394 | 29.4 | 2 +Convection operator | 0.01164006 | 11.3 | 7 +Diffusion operator | 0.01139027 | 11.0 | 16 +Gradient operator | 0.004048191 | 3.9 | 5 +Divergence operator | 0.001141949 | 1.1 | 6 +Source terms | 0.0002602714 | 0.3 | 4 +Update ::mettre_a_jour | 0.004800796 | 4.7 | 4 +Solver for implicit diffusion | 0.003161202 | 3.1 | 4 +Computation of the time step dt | 0.0007556401 | 0.7 | 6 +Post-treatment operations | 0.0315041 | 30.5 | 1 +Other operations | 0.004165016 | 4.0 | + +Average number of iteration of the linear solver per call: 22 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0301621 | 29.2 | 2 | +Kernels: | 0.025873 | 25.1 | 1116 | +Copy host to device: | 0.00202125 | 2.0 | 102 | 4.4 GB/s +Copy device to host: | 0.000121609 | 0.1 | 6 | 4.1 GB/s +Alloc/Free on device: | 2.90103e-05 | 0.0 | 856 | +GPU: 54% Copy H<->D: 2.1% Alloc/free: 0.028% Comm: 0% CPU & I/O: 44% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.030145 + +Total time for the whole computation 8.53248 + +[Slurm] Power consumption (38 s): 0.629 kW 0.007 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/GAMELAN_AMG/PAR_GAMELAN_AMG_BENCH.TU.is246827x8 b/tests/GPU/GAMELAN_AMG/PAR_GAMELAN_AMG_BENCH.TU.is246827x8 new file mode 100644 index 0000000000..790ebd73cb --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/PAR_GAMELAN_AMG_BENCH.TU.is246827x8 @@ -0,0 +1,63 @@ +Statistiques d'initialisation du calcul + +Temps total 2.30915 + +Statistiques de resolution du probleme + +Temps total 11.5886 + + +Timesteps 10 +Secondes / pas de temps 1.15886 +Dont solveurs Ax=B 0.652978 56% (2 appels/pas de temps) +Dont solveur diffusion_implicite 0.060133 5% (4 appels/pas de temps) +Dont mettre_a_jour 0.057110 4% (4 appels/pas de temps) +Dont operateurs convection 0.077288 6% (6.9 appels/pas de temps) +Dont operateurs diffusion 0.202981 17% (16 appels/pas de temps) +Dont operateurs gradient 0.019023 1% (5 appels/pas de temps) +Dont operateurs divergence 0.018322 1% (6 appels/pas de temps) +Dont operateurs source 0.003755 0% (4 appels/pas de temps) +Dont operations postraitement 0.010286 0% (1 appel/pas de temps) +Dont calcul dt 0.009854 0% (6 appels/pas de temps) +Dont calcul divers 0.085039 7% (0 appels/pas de temps) +Nb 
echange_espace_virtuel / pas de temps 105.9 +Nb MPI_allreduce / pas de temps 54.2 +----------------------------------------------------------------------------------------------------------------------------------------- +Warning: The number of MPI_allreduce calls per time step is high. Contact TRUST support if you plan to run massive parallel calculation. +----------------------------------------------------------------------------------------------------------------------------------------- +Nb solveur / pas de temps 2 +Secondes / solveur 0.326489 +Iterations / solveur 8.05 +I/O: +--------------------------------------------------------------------------------------------------------- +Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. +--------------------------------------------------------------------------------------------------------- +Communications avg 4.7 % of total time +Communications max 8.4 % of total time +Communications min 2.1 % of total time +Network latency benchmark 1.39952e-06 s +Network bandwidth max 4325.19 MB/s +Total network traffic 461.511 MB / timestep +Average message size 151.893 kB +Min waiting time 0.2 % of total time +Max waiting time 6.5 % of total time +Avg waiting time 2.9 % of total time + +Timesteps = number of time steps +Nb solveur = number of linear system resolutions +Nb assemblage implicite = number of matrix assemblies for the implicit scheme +Iterations = average number of iterations of the solver +Communications = fraction of the time spent + in communications between processors (excluding io files) +Network latency = time of one mpsum measured by an internal bench over 0.1s +Network bandwidth = maximum on all processors + of the average bandwidth of send_recv operations +Waiting time = estimation of the waiting time of the different processors + +Max_waiting_time big => probably due to a bad partitioning +Communications > 30% => too many processors or network too 
slow + +Statistiques de post resolution + +Temps total 0.0601 + diff --git a/tests/GPU/GAMELAN_AMG/check_perf.sh b/tests/GPU/GAMELAN_AMG/check_perf.sh new file mode 120000 index 0000000000..6d20411c12 --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/check_perf.sh @@ -0,0 +1 @@ +../DomainFlowLES/check_perf.sh \ No newline at end of file diff --git a/tests/GPU/GAMELAN_AMG/dom.geom b/tests/GPU/GAMELAN_AMG/dom.geom new file mode 120000 index 0000000000..0759265946 --- /dev/null +++ b/tests/GPU/GAMELAN_AMG/dom.geom @@ -0,0 +1 @@ +../../Dilatable/GAMELAN/dom.geom \ No newline at end of file diff --git a/tests/GPU/GMRES/GMRES.data b/tests/GPU/GMRES/GMRES.data index 895e8fcd83..c27c485015 100644 --- a/tests/GPU/GMRES/GMRES.data +++ b/tests/GPU/GMRES/GMRES.data @@ -41,7 +41,8 @@ Scatter DOM.Zones dom_fluide END SCATTER # -VEFPreP1B dis +VEFPreP1B dis +Lire dis { reorder { algo Hilbert } } Scheme_euler_implicit sch Read sch diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx90a b/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx90a index bd1cb44dd5..0d0c3bf521 100644 --- a/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx90a +++ b/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 12-03-2026 -- 18:25:55 -OS: g1016__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 09:21:16 +OS: g1109__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 54.233 +Total time of the start-up: 58.4859 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 2.47337 +Average time of the resolution of the linear problem per call: 3.10443 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 19.9272 +Total time of the time loop: 18.9179 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 2.21413 -Standard deviation between time steps: 0.279079 -Time elapsed in the skipped time steps: 8.57934 +Average time per time step: 2.10199 +Standard deviation between time steps: 0.290698 +Time elapsed in the skipped time steps: 18.7335 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 1.39803 | 63.1 | 3 -Matrix assembly for implicit scheme | 0.1719198 | 7.8 | 1 -Convection operator | 0.2154025 | 9.7 | 4 -Diffusion operator | 0.01442309 | 0.7 | 2 -Divergence operator | 0.03172028 | 1.4 | 4 -Source terms | 0.0005456014 | 0.0 | 2 -Update ::mettre_a_jour | 0.01157037 | 0.5 | 4 -Computation of the time step dt | 0.001521904 | 0.1 | 4 -Post-treatment operations | 0.02268971 | 1.0 | 1 -Other operations | 0.3463029 | 15.6 | +Linear solver resolutions 
Ax=B | 1.4093 | 67.0 | 3 +Matrix assembly for implicit scheme | 0.1359102 | 6.5 | 1 +Convection operator | 0.1733518 | 8.2 | 4 +Diffusion operator | 0.01203251 | 0.6 | 2 +Divergence operator | 0.02118798 | 1.0 | 4 +Source terms | 0.0005388649 | 0.0 | 2 +Update ::mettre_a_jour | 0.00881933 | 0.4 | 4 +Computation of the time step dt | 0.00153714 | 0.1 | 4 +Post-treatment operations | 0.02121914 | 1.0 | 1 +Other operations | 0.3180953 | 15.1 | Average number of iteration of the linear solver per call: 14.7 @@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call: 14.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0841743 | 3.8 | 1 | -Kernels: | 2.1175 | 95.6 | 1301 | -Copy host to device: | 0.000659637 | 0.0 | 21 | 6.8 GB/s -Copy device to host: | 0.00075392 | 0.0 | 7 | 14.8 GB/s -Alloc/Free on device: | 0.000119878 | 0.0 | 4 | -GPU: 99% Copy H<->D: 0.064% Alloc/free: 0.0054% Comm: 0% CPU & I/O: 0.49% +Libraries: | 0.0665175 | 3.2 | 1 | +Kernels: | 2.02331 | 96.3 | 1300 | +Copy host to device: | 0.000635208 | 0.0 | 21 | 7.1 GB/s +Copy device to host: | 0.000746803 | 0.0 | 7 | 14.9 GB/s +Alloc/Free on device: | 0.00011886 | 0.0 | 4 | +GPU: 99% Copy H<->D: 0.066% Alloc/free: 0.0057% Comm: 0% CPU & I/O: 0.51% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.197574 +Time of the post-resolution: 0.207713 -Total time for the whole computation 82.9371 +Total time for the whole computation 96.3451 -[Slurm] Power consumption (90 s): 0.535 kW 0.013 kWh 
0.001 € (0.10€/kWh) +[Slurm] Power consumption (104 s): 0.529 kW 0.015 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx942 b/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx942 index 3de23b884e..a2f22ab2f1 100644 --- a/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx942 +++ b/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx942 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 23-02-2026 -- 17:59:06 -OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 14:41:34 +OS: a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD Instinct MI300A Accelerator Total number of threads:192 GPU model: AMD Instinct MI300A @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 89.1523 +Total time of the start-up: 63.9456 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 3.94747 +Average time of the resolution of the linear problem per call: 3.87063 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not 
accounted for the computation of the time loop statistics -Total time of the time loop: 7.1811 +Total time of the time loop: 6.96203 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.7979 -Standard deviation between time steps: 0.123846 -Time elapsed in the skipped time steps: 9.4072 +Average time per time step: 0.773558 +Standard deviation between time steps: 0.122661 +Time elapsed in the skipped time steps: 17.8522 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.6164 | 77.3 | 3 -Matrix assembly for implicit scheme | 0.03673611 | 4.6 | 1 -Convection operator | 0.04092509 | 5.1 | 4 -Diffusion operator | 0.004677966 | 0.6 | 2 -Divergence operator | 0.009021316 | 1.1 | 4 -Source terms | 0.0003811747 | 0.0 | 2 -Update ::mettre_a_jour | 0.005522843 | 0.7 | 4 -Computation of the time step dt | 0.001337907 | 0.2 | 4 -Post-treatment operations | 0.01127625 | 1.4 | 1 -Other operations | 0.07162194 | 9.0 | +Linear solver resolutions Ax=B | 0.606553 | 78.4 | 3 +Matrix assembly for implicit scheme | 0.03394749 | 4.4 | 1 +Convection operator | 0.04057731 | 5.2 | 4 +Diffusion operator | 0.00410471 | 0.5 | 2 +Divergence operator | 0.006404666 | 0.8 | 4 +Source terms | 0.0002954579 | 0.0 | 2 +Update ::mettre_a_jour | 0.004831261 | 0.6 | 4 +Computation of the time step dt | 0.001217283 | 0.2 | 4 +Post-treatment operations | 0.01085299 | 1.4 | 1 +Other operations | 0.06477427 | 8.4 | Average number of iteration of the linear solver per call: 14.7 @@ -61,16 +61,17 @@ Average number of iteration of the linear solver per call: 14.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0385048 | 4.8 | 1 | -Kernels: | 0.747298 | 93.7 | 1301 | -Copy host to device: | 0.000577279 | 0.1 | 21 | 7.8 GB/s -Copy device to host: | 0.00051411 | 0.1 | 7 | 21.7 GB/s -Alloc/Free on device: | 0.000865148 | 0.1 | 4 | -GPU: 98% Copy H<->D: 0.14% Alloc/free: 0.11% Comm: 0% CPU & I/O: 1.3% +Libraries: | 0.0347636 | 4.5 | 1 | +Kernels: | 0.726319 | 93.9 | 1300 | +Copy host to device: | 0.000620905 | 0.1 | 21 | 7.2 GB/s +Copy device to host: | 0.000508449 | 0.1 | 7 | 21.9 GB/s +Alloc/Free on device: | 0.000853162 | 0.1 | 4 | +GPU: 98% Copy H<->D: 0.15% Alloc/free: 0.11% Comm: 0% CPU & I/O: 1.4% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.214741 +Time of the post-resolution: 0.183286 -Total time for the whole computation 105.955 +Total time for the whole computation 88.9431 +[Slurm] Power consumption (98 s): 0.703 kW 0.019 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.dalianvl_cc100 b/tests/GPU/GMRES/GMRES_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..0dfe5397eb --- /dev/null +++ b/tests/GPU/GMRES/GMRES_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the GMRES_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:15:07 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 39.6331 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 2.71223 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.59626 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.177362 +Standard deviation between time steps: 0.0223964 +Time elapsed in the skipped time steps: 9.6339 + + +Standard counter description | Time/step | % 
loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.11388 | 64.2 | 3 +Matrix assembly for implicit scheme | 0.01229249 | 6.9 | 1 +Convection operator | 0.01123903 | 6.3 | 4 +Diffusion operator | 0.001825577 | 1.0 | 2 +Divergence operator | 0.001687251 | 1.0 | 4 +Source terms | 0.0001569918 | 0.1 | 2 +Update ::mettre_a_jour | 0.002649643 | 1.5 | 4 +Computation of the time step dt | 0.0005573932 | 0.3 | 4 +Post-treatment operations | 0.005802905 | 3.3 | 1 +Other operations | 0.02727123 | 15.4 | + +Average number of iteration of the linear solver per call: 14.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0326826 | 18.4 | 1 | +Kernels: | 0.129523 | 73.0 | 1300 | +Copy host to device: | 0.000431132 | 0.2 | 21 | 10.4 GB/s +Copy device to host: | 0.00143199 | 0.8 | 7 | 7.8 GB/s +Alloc/Free on device: | 0.00336589 | 1.9 | 4 | +GPU: 91% Copy H<->D: 1.1% Alloc/free: 1.9% Comm: 0% CPU & I/O: 5.6% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.154901 + +Total time for the whole computation 51.0182 + diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.eureka_cc89 b/tests/GPU/GMRES/GMRES_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..99b4f42fb5 --- /dev/null +++ b/tests/GPU/GMRES/GMRES_BENCH.TU.eureka_cc89 @@ -0,0 +1,76 @@ + # Global 
performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GMRES_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:31:26 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 50.7317 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 3.94905 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 6.3836 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.709288 
+Standard deviation between time steps: 0.0878574 +Time elapsed in the skipped time steps: 21.172 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.483116 | 68.1 | 3 +Matrix assembly for implicit scheme | 0.03525991 | 5.0 | 1 +Convection operator | 0.03701098 | 5.2 | 4 +Diffusion operator | 0.003952027 | 0.6 | 2 +Divergence operator | 0.004919432 | 0.7 | 4 +Source terms | 0.0006515458 | 0.1 | 2 +Update ::mettre_a_jour | 0.004295798 | 0.6 | 4 +Computation of the time step dt | 0.001351149 | 0.2 | 4 +Post-treatment operations | 0.008900335 | 1.3 | 1 +Other operations | 0.129831 | 18.3 | + +Average number of iteration of the linear solver per call: 14.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.065369 | 9.2 | 1 | +Kernels: | 0.631272 | 89.0 | 1300 | +Copy host to device: | 0.00114264 | 0.2 | 21 | 3.9 GB/s +Copy device to host: | 0.00259112 | 0.4 | 7 | 4.3 GB/s +Alloc/Free on device: | 0.000653251 | 0.1 | 4 | +GPU: 98% Copy H<->D: 0.53% Alloc/free: 0.092% Comm: 0% CPU & I/O: 1.2% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.102475 + +Total time for the whole computation 78.3897 + diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.is157091_cc86 
b/tests/GPU/GMRES/GMRES_BENCH.TU.is157091_cc86 index d711f92cd4..aa08c81b5d 100644 --- a/tests/GPU/GMRES/GMRES_BENCH.TU.is157091_cc86 +++ b/tests/GPU/GMRES/GMRES_BENCH.TU.is157091_cc86 @@ -8,7 +8,7 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 17:55:11 +Date: 22-04-2026 -- 20:42:37 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 43.6421 +Total time of the start-up: 44.9979 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 2.76651 +Average time of the resolution of the linear problem per call: 3.21012 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 12.4133 +Total time of the time loop: 11.2014 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.37926 -Standard deviation between time steps: 0.145441 -Time elapsed in the skipped time steps: 
12.0402 +Average time per time step: 1.2446 +Standard deviation between time steps: 0.142212 +Time elapsed in the skipped time steps: 18.2877 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.839934 | 60.9 | 3 -Matrix assembly for implicit scheme | 0.100131 | 7.3 | 1 -Convection operator | 0.0936801 | 6.8 | 4 -Diffusion operator | 0.00734367 | 0.5 | 2 -Divergence operator | 0.02655093 | 1.9 | 4 -Source terms | 0.001257321 | 0.1 | 2 -Update ::mettre_a_jour | 0.01117306 | 0.8 | 4 -Computation of the time step dt | 0.002250517 | 0.2 | 4 -Post-treatment operations | 0.01673185 | 1.2 | 1 -Other operations | 0.2802051 | 20.3 | +Linear solver resolutions Ax=B | 0.800799 | 64.3 | 3 +Matrix assembly for implicit scheme | 0.07217464 | 5.8 | 1 +Convection operator | 0.0669646 | 5.4 | 4 +Diffusion operator | 0.005771725 | 0.5 | 2 +Divergence operator | 0.01634721 | 1.3 | 4 +Source terms | 0.0007613063 | 0.1 | 2 +Update ::mettre_a_jour | 0.008655555 | 0.7 | 4 +Computation of the time step dt | 0.002120476 | 0.2 | 4 +Post-treatment operations | 0.01154682 | 0.9 | 1 +Other operations | 0.2594554 | 20.8 | Average number of iteration of the linear solver per call: 14.7 @@ -61,16 +61,16 @@ Average number of iteration of the linear solver per call: 14.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.123609 | 9.0 | 1 | -Kernels: | 1.24552 | 90.3 | 1301 | -Copy host to device: | 0.00138436 | 0.1 | 21 | 3.2 GB/s -Copy device to host: | 0.00108039 | 0.1 | 7 | 10.3 GB/s -Alloc/Free on device: | 0.00046704 | 0.0 | 4 | -GPU: 99% Copy H<->D: 0.18% Alloc/free: 0.034% Comm: 0% CPU & I/O: 
0.52% +Libraries: | 0.105849 | 8.5 | 1 | +Kernels: | 1.12879 | 90.7 | 1300 | +Copy host to device: | 0.00137746 | 0.1 | 21 | 3.3 GB/s +Copy device to host: | 0.00110908 | 0.1 | 7 | 10.0 GB/s +Alloc/Free on device: | 0.000533048 | 0.0 | 4 | +GPU: 99% Copy H<->D: 0.2% Alloc/free: 0.043% Comm: 0% CPU & I/O: 0.56% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0728189 +Time of the post-resolution: 0.070975 -Total time for the whole computation 68.1685 +Total time for the whole computation 74.5579 diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.is159479_cc120 b/tests/GPU/GMRES/GMRES_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..834dadd02b --- /dev/null +++ b/tests/GPU/GMRES/GMRES_BENCH.TU.is159479_cc120 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the GMRES_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:34:27 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 33.6781 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 1.96677 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 3.25831 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.362034 +Standard deviation between time steps: 0.0415193 +Time elapsed in the skipped 
time steps: 12.0571 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.239558 | 66.2 | 3 +Matrix assembly for implicit scheme | 0.02138728 | 5.9 | 1 +Convection operator | 0.02537361 | 7.0 | 4 +Diffusion operator | 0.002365576 | 0.7 | 2 +Divergence operator | 0.002552803 | 0.7 | 4 +Source terms | 0.0003383349 | 0.1 | 2 +Update ::mettre_a_jour | 0.002266834 | 0.6 | 4 +Computation of the time step dt | 0.0007245106 | 0.2 | 4 +Post-treatment operations | 0.005288873 | 1.5 | 1 +Other operations | 0.06217785 | 17.2 | + +Average number of iteration of the linear solver per call: 14.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.043691 | 12.1 | 1 | +Kernels: | 0.31172 | 86.1 | 1300 | +Copy host to device: | 0.000469634 | 0.1 | 21 | 9.5 GB/s +Copy device to host: | 0.00146406 | 0.4 | 7 | 7.6 GB/s +Alloc/Free on device: | 0.000357557 | 0.1 | 4 | +GPU: 98% Copy H<->D: 0.53% Alloc/free: 0.099% Comm: 0% CPU & I/O: 1.2% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0655445 + +Total time for the whole computation 49.0591 + diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.is247793_gfx1100 b/tests/GPU/GMRES/GMRES_BENCH.TU.is247793_gfx1100 new file mode 100644 index 0000000000..051d8fa10d --- 
/dev/null +++ b/tests/GPU/GMRES/GMRES_BENCH.TU.is247793_gfx1100 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GMRES_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 17:33:51 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 38.0716 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 3.84061 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 13.4029 
+Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 1.48921 +Standard deviation between time steps: 0.222303 +Time elapsed in the skipped time steps: 15.3733 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 1.14468 | 76.9 | 3 +Matrix assembly for implicit scheme | 0.06534276 | 4.4 | 1 +Convection operator | 0.07545809 | 5.1 | 4 +Diffusion operator | 0.008406638 | 0.6 | 2 +Divergence operator | 0.01259237 | 0.8 | 4 +Source terms | 0.0009561102 | 0.1 | 2 +Update ::mettre_a_jour | 0.007802457 | 0.5 | 4 +Computation of the time step dt | 0.002251303 | 0.2 | 4 +Post-treatment operations | 0.01362194 | 0.9 | 1 +Other operations | 0.1580985 | 10.6 | + +Average number of iteration of the linear solver per call: 13.1 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.146646 | 9.8 | 1 | +Kernels: | 1.33413 | 89.6 | 1300 | +Copy host to device: | 0.00109438 | 0.1 | 21 | 4.1 GB/s +Copy device to host: | 0.000674911 | 0.0 | 7 | 16.5 GB/s +Alloc/Free on device: | 0.000762378 | 0.1 | 4 | +GPU: 99% Copy H<->D: 0.12% Alloc/free: 0.051% Comm: 0% CPU & I/O: 0.4% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.114679 + +Total time for the whole computation 66.9625 + diff --git 
a/tests/GPU/GMRES/GMRES_BENCH.TU.lumi_gfx90a b/tests/GPU/GMRES/GMRES_BENCH.TU.lumi_gfx90a new file mode 100644 index 0000000000..96e5d1d18e --- /dev/null +++ b/tests/GPU/GMRES/GMRES_BENCH.TU.lumi_gfx90a @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GMRES_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 18-05-2026 -- 08:53:57 +OS: nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +CPU model : AMD EPYC 7A53 64-Core Processor +Total number of threads:128 +GPU model: AMD Instinct MI250X +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 104.002 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 7.31327 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 17.8793 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 1.98659 +Standard deviation between time steps: 0.2676 +Time elapsed in the skipped time steps: 24.2401 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 1.31941 | 66.4 | 3 +Matrix assembly for implicit scheme | 0.1361106 | 6.9 | 1 +Convection operator | 0.1735005 | 8.7 | 4 +Diffusion operator | 0.01117017 | 0.6 | 2 +Divergence operator | 0.01994576 | 1.0 | 4 +Source terms | 0.0005305762 | 0.0 | 2 +Update ::mettre_a_jour | 0.008478555 | 0.4 | 4 +Computation of the time step dt | 0.001511032 | 0.1 | 4 +Post-treatment operations | 0.01971884 | 1.0 | 1 +Other operations | 0.2962186 | 14.9 | + +Average number of iteration of the linear solver per call: 14.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0650842 | 3.3 | 1 | +Kernels: | 1.90925 | 96.1 | 1300 | +Copy host to device: | 0.000662391 | 0.0 | 21 | 6.8 GB/s +Copy device to host: | 0.000750408 | 0.0 | 7 | 14.8 GB/s +Alloc/Free on device: | 0.000125511 | 0.0 | 4 | +GPU: 99% Copy H<->D: 0.071% Alloc/free: 0.0063% Comm: 0% CPU & I/O: 0.54% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 
Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.204038 + +Total time for the whole computation 146.325 + +[Slurm] Power consumption (170 s): 0.518 kW 0.024 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.topaze_cc80 b/tests/GPU/GMRES/GMRES_BENCH.TU.topaze_cc80 index ca10fe9c5e..f22144e384 100644 --- a/tests/GPU/GMRES/GMRES_BENCH.TU.topaze_cc80 +++ b/tests/GPU/GMRES/GMRES_BENCH.TU.topaze_cc80 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 15:24:53 -OS: topaze7046__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 15-05-2026 -- 13:31:12 +OS: topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 57.25 +Total time of the start-up: 61.3805 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 4.33425 +Average time of the resolution of the linear problem per call: 4.66789 Average number of iteration of the linear solver per call: 0 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 5.0447 +Total time of the time loop: 4.57182 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.560522 -Standard deviation between time steps: 0.054987 -Time elapsed in the skipped time steps: 16.1386 +Average time per time step: 0.50798 +Standard deviation between time steps: 0.0546923 +Time elapsed in the skipped time steps: 23.7777 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.333833 | 59.6 | 3 -Matrix assembly for implicit scheme | 0.04433419 | 7.9 | 1 -Convection operator | 0.04280126 | 7.6 | 4 -Diffusion operator | 0.005348669 | 1.0 | 2 -Divergence operator | 0.008769969 | 1.6 | 4 -Source terms | 0.001239124 | 0.2 | 2 -Update ::mettre_a_jour | 0.004984535 | 0.9 | 4 -Computation of the time step dt | 0.001195886 | 0.2 | 4 -Post-treatment operations | 0.01160467 | 2.1 | 1 -Other operations | 0.10641 | 19.0 | +Linear solver resolutions Ax=B | 0.327822 | 64.5 | 3 +Matrix assembly for implicit scheme | 0.02985497 | 5.9 | 1 +Convection operator | 0.02795193 | 5.5 | 4 +Diffusion operator | 0.004184844 | 0.8 | 2 +Divergence operator | 0.00393144 | 0.8 | 4 +Source terms | 0.0004237254 | 0.1 | 2 +Update ::mettre_a_jour | 0.003992837 | 0.8 | 4 +Computation of the time step dt | 0.001005162 | 0.2 | 4 +Post-treatment operations | 0.00991028 | 2.0 | 1 +Other operations | 0.09890277 | 19.5 | Average number of iteration of the linear solver per call: 14.7 @@ -61,16 +61,17 @@ Average number of iteration of the linear solver per call: 
14.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0617394 | 11.0 | 1 | -Kernels: | 0.483261 | 86.2 | 1301 | -Copy host to device: | 0.00155029 | 0.3 | 21 | 2.9 GB/s -Copy device to host: | 0.0014181 | 0.3 | 7 | 7.9 GB/s -Alloc/Free on device: | 0.000880882 | 0.2 | 4 | -GPU: 97% Copy H<->D: 0.53% Alloc/free: 0.16% Comm: 0% CPU & I/O: 2.1% +Libraries: | 0.0595002 | 11.7 | 1 | +Kernels: | 0.433552 | 85.3 | 1300 | +Copy host to device: | 0.00174372 | 0.3 | 21 | 2.6 GB/s +Copy device to host: | 0.00094954 | 0.2 | 7 | 11.7 GB/s +Alloc/Free on device: | 0.00095336 | 0.2 | 4 | +GPU: 97% Copy H<->D: 0.53% Alloc/free: 0.19% Comm: 0% CPU & I/O: 2.2% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.235477 +Time of the post-resolution: 0.213536 -Total time for the whole computation 78.6688 +Total time for the whole computation 89.9436 +[Slurm] Power consumption (129 s): 0.454 kW 0.016 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/GPU4/GPU4.data b/tests/GPU/GPU4/GPU4.data index 922b0221ab..c0a8e9fee7 100644 --- a/tests/GPU/GPU4/GPU4.data +++ b/tests/GPU/GPU4/GPU4.data @@ -40,7 +40,8 @@ END PARTITION # Scatter DOM.Zones dom END SCATTER # -VEFPreP1B dis +VEFPreP1B dis +Lire dis { reorder { algo Hilbert } } Scheme_euler_explicit sch Read sch diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx90a b/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx90a index 0e42504b80..bcabd77109 100644 --- a/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx90a +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx90a @@ 
-8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 21:11:25 -OS: g1031__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 24-05-2026 -- 15:59:02 +OS: g1321__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 41.562 +Total time of the start-up: 32.7813 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.31268 +Average time of the resolution of the linear problem per call: 1.65578 Average number of iteration of the linear solver per call: 33 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.49597 +Total time of the time loop: 1.05563 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.166219 -Standard deviation between time steps: 0.0078387 -Time elapsed in the skipped time steps: 0.142076 +Average time per time step: 
0.117292 +Standard deviation between time steps: 0.00824722 +Time elapsed in the skipped time steps: 0.101644 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0452288 | 24.9 | 1 -Convection operator | 0.02206269 | 12.1 | 1 -Diffusion operator | 0.008556311 | 4.7 | 1 -Gradient operator | 0.04916169 | 27.0 | 2 -Divergence operator | 0.01618532 | 8.9 | 2 -Update ::mettre_a_jour | 0.01322576 | 7.3 | 1 -Computation of the time step dt | 0.006794905 | 3.7 | 2 -Post-treatment operations | 0.00346718 | 1.9 | 1 -Other operations | 0.001535927 | 0.8 | +Linear solver resolutions Ax=B | 0.0370046 | 31.5 | 1 +Convection operator | 0.01428056 | 12.2 | 1 +Diffusion operator | 0.008201341 | 7.0 | 1 +Gradient operator | 0.02456975 | 20.9 | 2 +Divergence operator | 0.01065395 | 9.1 | 2 +Update ::mettre_a_jour | 0.009077858 | 7.7 | 1 +Computation of the time step dt | 0.005003812 | 4.3 | 2 +Post-treatment operations | 0.003668977 | 3.1 | 1 +Other operations | 0.00483125 | 4.1 | -Average number of iteration of the linear solver per call: 20.3 +Average number of iteration of the linear solver per call: 20.7 ----------------------------------------------------------------------------------------------------------- @@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call: 20.3 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0450126 | 27.1 | 1 | -Kernels: | 0.115366 | 69.4 | 147 | -Copy host to device: | 0.00027125 | 0.2 | 10 | 4.9 GB/s -Copy device to host: | 0.000193674 | 0.1 | 1 | 17.3 GB/s -Alloc/Free on device: | 1.50704e-05 | 0.0 | 0 | -GPU: 96% Copy H<->D: 
0.28% Alloc/free: 0.0091% Comm: 0% CPU & I/O: 3.2% +Libraries: | 0.0367933 | 31.4 | 1 | +Kernels: | 0.0751332 | 64.1 | 153 | +Copy host to device: | 0.000225522 | 0.2 | 9 | 5.1 GB/s +Copy device to host: | 0.000229117 | 0.2 | 2 | 15.5 GB/s +Alloc/Free on device: | 1.44047e-05 | 0.0 | 0 | +GPU: 95% Copy H<->D: 0.39% Alloc/free: 0.012% Comm: 0% CPU & I/O: 4.2% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.157813 +Time of the post-resolution: 0.15931 -Total time for the whole computation 43.3579 +Total time for the whole computation 34.0979 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (40 s): 0.438 kW 0.005 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx942 b/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx942 index 6b13b9d45c..3ecbd97739 100644 --- a/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx942 +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx942 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 17-03-2026 -- 17:53:25 -OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 08-06-2026 -- 14:41:00 +OS: a1002__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD Instinct MI300A Accelerator Total number of threads:192 GPU model: AMD Instinct MI300A @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 79.453 +Total time of the start-up: 26.4833 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.8534 +Average time of the resolution of the linear problem per call: 1.3744 Average number of iteration of the linear solver per call: 33 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.493985 +Total time of the time loop: 0.455643 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0548872 -Standard deviation between time steps: 0.00732123 -Time elapsed in the skipped time steps: 0.075823 +Average time per time step: 0.050627 +Standard deviation between time steps: 0.00715344 +Time elapsed in the skipped time steps: 0.0646453 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0205601 | 37.5 | 1 -Convection operator | 0.005483088 | 10.0 | 1 -Diffusion operator | 0.002202733 | 4.0 | 1 -Gradient operator | 0.01036635 | 18.9 | 2 -Divergence operator | 0.004477627 | 8.2 | 2 -Update ::mettre_a_jour | 0.003807159 | 6.9 | 1 -Computation of the time step dt | 0.001930352 | 3.5 | 2 -Post-treatment operations | 0.003031623 | 5.5 | 1 -Other operations | 0.003028226 | 5.5 | +Linear solver resolutions Ax=B | 0.019712 | 38.9 | 1 
+Convection operator | 0.00476502 | 9.4 | 1 +Diffusion operator | 0.002064178 | 4.1 | 1 +Gradient operator | 0.01031318 | 20.4 | 2 +Divergence operator | 0.002667278 | 5.3 | 2 +Update ::mettre_a_jour | 0.002899753 | 5.7 | 1 +Computation of the time step dt | 0.001444263 | 2.9 | 2 +Post-treatment operations | 0.002739872 | 5.4 | 1 +Other operations | 0.004021502 | 7.9 | -Average number of iteration of the linear solver per call: 20.3 +Average number of iteration of the linear solver per call: 20.7 ----------------------------------------------------------------------------------------------------------- @@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call: 20.3 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0204319 | 37.2 | 1 | -Kernels: | 0.0293497 | 53.5 | 147 | -Copy host to device: | 0.000250773 | 0.5 | 10 | 5.3 GB/s -Copy device to host: | 0.000111261 | 0.2 | 1 | 30.2 GB/s -Alloc/Free on device: | 0.000108043 | 0.2 | 0 | -GPU: 91% Copy H<->D: 0.66% Alloc/free: 0.2% Comm: 0% CPU & I/O: 8.4% +Libraries: | 0.0196001 | 38.7 | 1 | +Kernels: | 0.0265284 | 52.4 | 153 | +Copy host to device: | 0.000192319 | 0.4 | 9 | 5.9 GB/s +Copy device to host: | 0.000149286 | 0.3 | 2 | 23.8 GB/s +Alloc/Free on device: | 0.00011148 | 0.2 | 0 | +GPU: 91% Copy H<->D: 0.67% Alloc/free: 0.22% Comm: 0% CPU & I/O: 8% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.147033 +Time of the post-resolution: 0.142964 -Total time for the whole computation 80.1698 +Total 
time for the whole computation 27.1465 -[Slurm] Power consumption (95 s): 0.624 kW 0.016 kWh 0.002 € (0.10€/kWh) +[Slurm] Power consumption (34 s): 0.629 kW 0.006 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.dalianvl_cc100 b/tests/GPU/GPU4/GPU4_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..f000fce39f --- /dev/null +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GPU4_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:15:53 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2592000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 30.6801 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.40983 +Average number of iteration of the linear solver per call: 33 + 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.298542 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0331714 +Standard deviation between time steps: 0.00423861 +Time elapsed in the skipped time steps: 0.0267229 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0175178 | 52.8 | 1 +Convection operator | 0.002603557 | 7.8 | 1 +Diffusion operator | 0.0007492484 | 2.3 | 1 +Gradient operator | 0.002730113 | 8.2 | 2 +Divergence operator | 0.0008824289 | 2.7 | 2 +Update ::mettre_a_jour | 0.00135279 | 4.1 | 1 +Computation of the time step dt | 0.0005503397 | 1.7 | 2 +Post-treatment operations | 0.001907084 | 5.7 | 1 +Other operations | 0.004878026 | 14.7 | + +Average number of iteration of the linear solver per call: 20.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0173774 | 52.4 | 1 | +Kernels: | 0.0106231 | 32.0 | 153 | +Copy host to device: | 0.000173476 | 0.5 | 9 | 6.6 GB/s +Copy device to host: | 7.98649e-05 | 0.2 | 2 | 44.5 GB/s +Alloc/Free on device: | 5.66116e-05 | 0.2 | 0 | +GPU: 84% Copy H<->D: 0.76% Alloc/free: 0.17% Comm: 0% CPU & I/O: 15% 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.124356 + +Total time for the whole computation 31.1298 + diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.eureka_cc89 b/tests/GPU/GPU4/GPU4_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..c4c8d5692a --- /dev/null +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.eureka_cc89 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GPU4_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:32:28 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2592000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 32.9864 + +Number of calls to the linear solver per time step: 2 +Average time of the 
resolution of the linear problem per call: 1.77157 +Average number of iteration of the linear solver per call: 33 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.649571 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0721746 +Standard deviation between time steps: 0.00304736 +Time elapsed in the skipped time steps: 0.0509077 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0351763 | 48.7 | 1 +Convection operator | 0.008166033 | 11.3 | 1 +Diffusion operator | 0.00277643 | 3.8 | 1 +Gradient operator | 0.007993661 | 11.1 | 2 +Divergence operator | 0.003646946 | 5.1 | 2 +Update ::mettre_a_jour | 0.004365205 | 6.0 | 1 +Computation of the time step dt | 0.001764848 | 2.4 | 2 +Post-treatment operations | 0.001672337 | 2.3 | 1 +Other operations | 0.006612768 | 9.2 | + +Average number of iteration of the linear solver per call: 20.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0349861 | 48.5 | 1 | +Kernels: | 0.0334288 | 46.3 | 153 | +Copy host to device: | 0.000193372 | 0.3 | 9 | 5.9 GB/s +Copy device to host: | 0.000633207 | 0.9 | 2 | 5.6 GB/s +Alloc/Free on device: | 
2.62861e-05 | 0.0 | 0 | +GPU: 95% Copy H<->D: 1.1% Alloc/free: 0.036% Comm: 0% CPU & I/O: 4% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0631071 + +Total time for the whole computation 33.75 + diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/GPU4/GPU4_BENCH.TU.irene-amd-ccrt_cc70 index 2434000bde..d7c9ed336b 100644 --- a/tests/GPU/GPU4/GPU4_BENCH.TU.irene-amd-ccrt_cc70 +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.irene-amd-ccrt_cc70 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 14:15:18 -OS: irene7056__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 23-04-2026 -- 14:51:45 +OS: irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz Total number of threads:80 GPU model: Tesla V100-SXM2-16GB @@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 50.9891 +Total time of the start-up: 52.3282 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 2.63023 +Average time of the resolution of the linear problem per call: 
2.75565 Average number of iteration of the linear solver per call: 33 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.32894 +Total time of the time loop: 1.28124 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.14766 -Standard deviation between time steps: 0.00667253 -Time elapsed in the skipped time steps: 0.142905 +Average time per time step: 0.14236 +Standard deviation between time steps: 0.00709995 +Time elapsed in the skipped time steps: 0.120199 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0643949 | 43.6 | 1 -Convection operator | 0.01452043 | 9.8 | 1 -Diffusion operator | 0.004831964 | 3.3 | 1 -Gradient operator | 0.02015019 | 13.6 | 2 -Divergence operator | 0.01717549 | 11.6 | 2 -Update ::mettre_a_jour | 0.0138784 | 9.4 | 1 -Computation of the time step dt | 0.006865698 | 4.6 | 2 -Post-treatment operations | 0.003516858 | 2.4 | 1 -Other operations | 0.002325704 | 1.6 | +Linear solver resolutions Ax=B | 0.0644412 | 45.3 | 1 +Convection operator | 0.01452938 | 10.2 | 1 +Diffusion operator | 0.004854963 | 3.4 | 1 +Gradient operator | 0.01442492 | 10.1 | 2 +Divergence operator | 0.0172205 | 12.1 | 2 +Update ::mettre_a_jour | 0.01393577 | 9.8 | 1 +Computation of the time step dt | 0.006881474 | 4.8 | 2 +Post-treatment operations | 0.003676644 | 2.6 | 1 +Other operations | 0.002395204 | 1.7 | Average number of iteration of the linear solver per call: 20.7 @@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call: 20.7 
----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0641017 | 43.4 | 1 | -Kernels: | 0.0777633 | 52.7 | 147 | -Copy host to device: | 0.000519965 | 0.4 | 10 | 2.6 GB/s -Copy device to host: | 0.000765415 | 0.5 | 1 | 4.4 GB/s -Alloc/Free on device: | 4.60109e-05 | 0.0 | 0 | -GPU: 96% Copy H<->D: 0.87% Alloc/free: 0.031% Comm: 0% CPU & I/O: 3% +Libraries: | 0.0641377 | 45.1 | 1 | +Kernels: | 0.0719512 | 50.5 | 146 | +Copy host to device: | 0.00053085 | 0.4 | 10 | 2.5 GB/s +Copy device to host: | 0.000839204 | 0.6 | 1 | 4.0 GB/s +Alloc/Free on device: | 5.52052e-05 | 0.0 | 0 | +GPU: 96% Copy H<->D: 0.96% Alloc/free: 0.039% Comm: 0% CPU & I/O: 3.4% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.149494 +Time of the post-resolution: 0.150737 -Total time for the whole computation 52.6105 +Total time for the whole computation 53.8804 -[Slurm] Power consumption (68 s): 0.205 kW 0.004 kWh 0.000 € (0.10€/kWh) +[Slurm] Power consumption (67 s): 0.211 kW 0.004 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.is157091_cc86 b/tests/GPU/GPU4/GPU4_BENCH.TU.is157091_cc86 index 81926a0981..ce591d36ec 100644 --- a/tests/GPU/GPU4/GPU4_BENCH.TU.is157091_cc86 +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.is157091_cc86 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 10-03-2026 -- 08:40:38 +Date: 14-05-2026 -- 16:09:55 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 GPU model: NVIDIA RTX A6000 CUDA runtime version: 12.90 -CUDA drivers version: 12.70 +CUDA drivers version: 13.20 Nb procs used for the computation: 1 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 2592000 @@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 34.5173 +Total time of the start-up: 35.7139 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.46653 +Average time of the resolution of the linear problem per call: 1.70453 Average number of iteration of the linear solver per call: 33 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.26768 +Total time of the time loop: 1.00381 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.140853 -Standard deviation between time steps: 0.00449016 -Time elapsed in the skipped time steps: 0.0853086 +Average time per time step: 0.111534 +Standard deviation between time steps: 0.00154677 +Time elapsed in the skipped time 
steps: 0.0637836 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0663534 | 47.1 | 1 -Convection operator | 0.01407047 | 10.0 | 1 -Diffusion operator | 0.004086097 | 2.9 | 1 -Gradient operator | 0.01717689 | 12.2 | 2 -Divergence operator | 0.01332984 | 9.5 | 2 -Update ::mettre_a_jour | 0.01163229 | 8.3 | 1 -Computation of the time step dt | 0.007360857 | 5.2 | 2 -Post-treatment operations | 0.002709982 | 1.9 | 1 -Other operations | 0.004133031 | 2.9 | +Linear solver resolutions Ax=B | 0.0576824 | 51.7 | 1 +Convection operator | 0.01250822 | 11.2 | 1 +Diffusion operator | 0.003500001 | 3.1 | 1 +Gradient operator | 0.007888202 | 7.1 | 2 +Divergence operator | 0.008768458 | 7.9 | 2 +Update ::mettre_a_jour | 0.008300216 | 7.4 | 1 +Computation of the time step dt | 0.005651737 | 5.1 | 2 +Post-treatment operations | 0.00148399 | 1.3 | 1 +Other operations | 0.005751195 | 5.2 | Average number of iteration of the linear solver per call: 20.7 @@ -60,16 +60,16 @@ Average number of iteration of the linear solver per call: 20.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0660378 | 46.9 | 1 | -Kernels: | 0.0706805 | 50.2 | 147 | -Copy host to device: | 0.000225445 | 0.2 | 10 | 5.9 GB/s -Copy device to host: | 0.000299242 | 0.2 | 1 | 11.2 GB/s -Alloc/Free on device: | 2.02081e-05 | 0.0 | 0 | -GPU: 97% Copy H<->D: 0.37% Alloc/free: 0.014% Comm: 0% CPU & I/O: 2.5% +Libraries: | 0.0573718 | 51.4 | 1 | +Kernels: | 0.0510853 | 45.8 | 153 | +Copy host to device: | 0.000203896 | 0.2 | 9 | 5.6 GB/s +Copy device to host: | 0.000333923 | 0.3 | 2 | 10.6 GB/s +Alloc/Free on device: | 
2.21462e-05 | 0.0 | 0 | +GPU: 97% Copy H<->D: 0.48% Alloc/free: 0.02% Comm: 0% CPU & I/O: 2.3% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0596979 +Time of the post-resolution: 0.0572605 -Total time for the whole computation 35.93 +Total time for the whole computation 36.8387 diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.is159479_cc120 b/tests/GPU/GPU4/GPU4_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..4c415877a8 --- /dev/null +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.is159479_cc120 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the GPU4_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:35:08 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2592000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 24.6151 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.970803 +Average number of iteration of the linear solver per call: 33 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.394239 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0438043 +Standard deviation between time steps: 0.00198087 +Time elapsed in the skipped time steps: 0.0374093 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0235285 | 53.7 | 1 +Convection operator | 0.004622394 | 10.6 | 1 +Diffusion operator | 0.001596872 | 3.6 | 1 +Gradient operator | 0.004136096 | 9.4 | 2 +Divergence operator | 0.001846463 | 4.2 | 2 +Update ::mettre_a_jour | 0.002348554 | 5.4 | 1 +Computation of the time step dt | 0.0009105937 | 2.1 | 2 +Post-treatment operations | 0.001085021 | 2.5 | 1 +Other operations | 0.003729836 | 8.5 | + +Average number of iteration of the linear solver per call: 20.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0234364 | 53.5 | 1 | +Kernels: | 0.0180744 | 41.3 | 146 | +Copy host to device: | 0.000161213 | 0.4 | 10 | 8.3 GB/s +Copy device to host: | 0.000382395 | 0.9 | 1 | 8.8 GB/s +Alloc/Free on device: | 2.0011e-05 | 0.0 | 0 | +GPU: 95% Copy H<->D: 1.2% Alloc/free: 0.046% Comm: 0% CPU & I/O: 3.9% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0507625 + +Total time for the whole computation 25.0975 + diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.is247793_gfx1100 b/tests/GPU/GPU4/GPU4_BENCH.TU.is247793_gfx1100 index 3478687aa7..d014f13ceb 100644 --- a/tests/GPU/GPU4/GPU4_BENCH.TU.is247793_gfx1100 +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.is247793_gfx1100 @@ -1,50 +1,75 @@ -Statistiques d'initialisation du calcul - -Temps total 51.3163 - -Statistiques de resolution du probleme - -Temps total 16.0305 - - -Timesteps 10 -Secondes / pas de temps 1.60305 -Dont solveurs Ax=B 1.409289 87% (1 appel/pas de temps) -Dont mettre_a_jour 0.018862 1% (1 appel/pas de temps) -Dont operateurs convection 0.029484 1% (1 appel/pas de temps) -Dont operateurs diffusion 0.010304 0% (1 appel/pas de temps) -Dont operateurs gradient 0.022206 1% (2 appels/pas de temps) -Dont operateurs divergence 0.008665 0% (2 appels/pas de temps) -Dont operations postraitement 0.074592 4% (1 appel/pas de temps) -Dont calcul dt 0.004162 0% (2 appels/pas de temps) -Dont calcul divers 0.025483 1% (0 appels/pas de temps) -Nb solveur / pas de temps 1 -Secondes / solveur 1.40929 -Iterations / solveur 275.6 -GPU statistics per time step (experimental): -Libraries : 1.409001 s 87.9% 1.0 calls -Kernels : 0.094763 s 5.9% 116.7 calls -Copy H2D : 0.006382 s 0.4% 15.8 
calls 11.9 GB/s -Copy D2H : 0.012701 s 0.8% 19.1 calls 17.4 GB/s -Alloc/Free: 0.000266 s 0.0% 1.4 calls -GPU: 93.8% Copy H<->D: 1.1% Alloc/Free: 0% Comm: 0% CPU & Others: 4.9% -I/O: - -Timesteps = number of time steps -Nb solveur = number of linear system resolutions -Nb assemblage implicite = number of matrix assemblies for the implicit scheme -Iterations = average number of iterations of the solver -Communications = fraction of the time spent - in communications between processors (excluding io files) -Network latency = time of one mpsum measured by an internal bench over 0.1s -Network bandwidth = maximum on all processors - of the average bandwidth of send_recv operations -Waiting time = estimation of the waiting time of the different processors - -Max_waiting_time big => probably due to a bad partitioning -Communications > 30% => too many processors or network too slow - -Statistiques de post resolution - -Temps total 0.068697 + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the GPU4_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 19:03:06 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2592000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 49.3535 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 3.13722 +Average number of iteration of the linear solver per call: 33 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.14455 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.127172 +Standard deviation between time steps: 0.00390832 +Time elapsed in the skipped time steps: 0.0844205 + + +Standard 
counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0652737 | 51.3 | 1 +Convection operator | 0.01091375 | 8.6 | 1 +Diffusion operator | 0.004316445 | 3.4 | 1 +Gradient operator | 0.01320478 | 10.4 | 2 +Divergence operator | 0.006607952 | 5.2 | 2 +Update ::mettre_a_jour | 0.007333066 | 5.8 | 1 +Computation of the time step dt | 0.003446431 | 2.7 | 2 +Post-treatment operations | 0.002252622 | 1.8 | 1 +Other operations | 0.01382318 | 10.9 | + +Average number of iteration of the linear solver per call: 20.7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0648951 | 51.0 | 1 | +Kernels: | 0.055383 | 43.5 | 153 | +Copy host to device: | 0.000475677 | 0.4 | 9 | 2.4 GB/s +Copy device to host: | 0.000299563 | 0.2 | 2 | 11.9 GB/s +Alloc/Free on device: | 0.000134375 | 0.1 | 0 | +GPU: 95% Copy H<->D: 0.61% Alloc/free: 0.11% Comm: 0% CPU & I/O: 4.7% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.101868 + +Total time for the whole computation 50.6843 diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.lumi_gfx90a b/tests/GPU/GPU4/GPU4_BENCH.TU.lumi_gfx90a index fc180748d0..0c54e27f11 100644 --- a/tests/GPU/GPU4/GPU4_BENCH.TU.lumi_gfx90a +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.lumi_gfx90a @@ -8,8 +8,8 @@ Time is given in 
seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 23-02-2026 -- 23:59:18 -OS: nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 18-05-2026 -- 08:58:15 +OS: nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 80.7221 +Total time of the start-up: 95.9527 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 3.24025 -Average number of iteration of the linear solver per call: 31.5 +Average time of the resolution of the linear problem per call: 4.54876 +Average number of iteration of the linear solver per call: 33 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.48028 +Total time of the time loop: 0.983467 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.164476 -Standard deviation between time steps: 0.00638041 -Time elapsed in the 
skipped time steps: 0.139515 +Average time per time step: 0.109274 +Standard deviation between time steps: 0.00630091 +Time elapsed in the skipped time steps: 0.0982491 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.044052 | 24.5 | 1 -Convection operator | 0.02158292 | 12.0 | 1 -Diffusion operator | 0.008052386 | 4.5 | 1 -Gradient operator | 0.05045038 | 28.0 | 2 -Divergence operator | 0.01614263 | 9.0 | 2 -Update ::mettre_a_jour | 0.01302959 | 7.2 | 1 -Computation of the time step dt | 0.006851856 | 3.8 | 2 -Post-treatment operations | 0.003123114 | 1.7 | 1 -Other operations | 0.001190999 | 0.7 | +Linear solver resolutions Ax=B | 0.0349821 | 32.0 | 1 +Convection operator | 0.01327583 | 12.1 | 1 +Diffusion operator | 0.007554157 | 6.9 | 1 +Gradient operator | 0.02225961 | 20.4 | 2 +Divergence operator | 0.01005504 | 9.2 | 2 +Update ::mettre_a_jour | 0.008657176 | 7.9 | 1 +Computation of the time step dt | 0.004889879 | 4.5 | 2 +Post-treatment operations | 0.002895587 | 2.6 | 1 +Other operations | 0.004704638 | 4.3 | -Average number of iteration of the linear solver per call: 17.7 +Average number of iteration of the linear solver per call: 20.7 ----------------------------------------------------------------------------------------------------------- @@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call: 17.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0432993 | 26.3 | 1 | -Kernels: | 0.115903 | 70.5 | 147 | -Copy host to device: | 0.000290883 | 0.2 | 10 | 4.6 GB/s -Copy device to host: | 0.000189674 | 0.1 | 1 | 17.7 GB/s -Alloc/Free 
on device: | 1.32044e-05 | 0.0 | 0 | -GPU: 97% Copy H<->D: 0.29% Alloc/free: 0.008% Comm: 0% CPU & I/O: 2.9% +Libraries: | 0.0347634 | 31.8 | 1 | +Kernels: | 0.0698308 | 63.9 | 153 | +Copy host to device: | 0.000244381 | 0.2 | 9 | 4.7 GB/s +Copy device to host: | 0.000228841 | 0.2 | 2 | 15.5 GB/s +Alloc/Free on device: | 1.47298e-05 | 0.0 | 0 | +GPU: 96% Copy H<->D: 0.43% Alloc/free: 0.013% Comm: 0% CPU & I/O: 3.8% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.139935 +Time of the post-resolution: 0.149004 -Total time for the whole computation 82.4819 +Total time for the whole computation 97.1834 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (121 s): 0.485 kW 0.016 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.topaze_cc80 b/tests/GPU/GPU4/GPU4_BENCH.TU.topaze_cc80 index e257d2f32c..97412b3ffd 100644 --- a/tests/GPU/GPU4/GPU4_BENCH.TU.topaze_cc80 +++ b/tests/GPU/GPU4/GPU4_BENCH.TU.topaze_cc80 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 17:43:48 -OS: topaze7059__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 15-05-2026 -- 13:33:13 +OS: topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 45.444 +Total time of the start-up: 45.6469 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 2.14431 +Average time of the resolution of the linear problem per call: 2.08417 Average number of iteration of the linear solver per call: 33 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.652446 +Total time of the time loop: 0.556963 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.072494 -Standard deviation between time steps: 0.00699117 -Time elapsed in the skipped time steps: 0.115932 +Average time per time step: 0.0618848 +Standard deviation between time steps: 0.00639562 +Time elapsed in the skipped time steps: 0.0650986 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0327829 | 38.4 | 1 -Convection operator | 0.008547117 | 10.0 | 1 -Diffusion operator | 0.002531127 | 3.0 | 1 -Gradient operator | 0.009250802 | 10.8 | 2 -Divergence operator | 0.004430615 | 5.2 | 2 -Update ::mettre_a_jour | 0.004444057 | 5.2 | 1 -Computation of the time step dt | 0.002272804 | 2.7 | 2 -Post-treatment operations | 0.003122811 | 3.7 | 1 -Other operations | 0.005111731 | 6.0 | +Linear solver resolutions Ax=B | 0.0319398 | 51.6 | 1 
+Convection operator | 0.005375055 | 8.7 | 1 +Diffusion operator | 0.002208149 | 3.6 | 1 +Gradient operator | 0.0070071 | 11.3 | 2 +Divergence operator | 0.001991878 | 3.2 | 2 +Update ::mettre_a_jour | 0.0028667 | 4.6 | 1 +Computation of the time step dt | 0.001355453 | 2.2 | 2 +Post-treatment operations | 0.002902446 | 4.7 | 1 +Other operations | 0.00623828 | 10.1 | Average number of iteration of the linear solver per call: 20.7 @@ -60,16 +60,17 @@ Average number of iteration of the linear solver per call: 20.7 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0326061 | 45.0 | 1 | -Kernels: | 0.0341615 | 47.1 | 147 | -Copy host to device: | 0.000264131 | 0.4 | 10 | 5.1 GB/s -Copy device to host: | 0.000340788 | 0.5 | 1 | 9.9 GB/s -Alloc/Free on device: | 4.67921e-05 | 0.1 | 0 | -GPU: 92% Copy H<->D: 0.83% Alloc/free: 0.065% Comm: 0% CPU & I/O: 7% +Libraries: | 0.0317634 | 51.3 | 1 | +Kernels: | 0.0238069 | 38.5 | 153 | +Copy host to device: | 0.000227296 | 0.4 | 9 | 5.0 GB/s +Copy device to host: | 0.000285206 | 0.5 | 2 | 12.4 GB/s +Alloc/Free on device: | 4.25278e-05 | 0.1 | 0 | +GPU: 90% Copy H<->D: 0.83% Alloc/free: 0.069% Comm: 0% CPU & I/O: 9.3% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.181824 +Time of the post-resolution: 0.171073 -Total time for the whole computation 46.3942 +Total time for the whole computation 46.4401 +[Slurm] Power consumption (80 s): 0.409 kW 0.009 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab.data 
b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab.data index d49102d58b..7aa2fe01ab 100644 --- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab.data +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab.data @@ -41,7 +41,8 @@ Scatter DOM.Zones dom_fluide END SCATTER # -VEFPreP1B dis +VEFPreP1B dis +Lire dis { reorder { algo Hilbert } } Scheme_euler_implicit sch Read sch diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx90a b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx90a index d91d299b7a..916ff81393 100644 --- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx90a +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 12-03-2026 -- 18:28:24 -OS: g1016__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 24-05-2026 -- 16:00:01 +OS: g1321__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 54.1322 +Total time of the start-up: 44.9673 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 2.49013 +Average time of the resolution of the linear problem per call: 
3.22864 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 12.1558 +Total time of the time loop: 10.5043 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.35065 -Standard deviation between time steps: 0.0459982 -Time elapsed in the skipped time steps: 14.7339 +Average time per time step: 1.16715 +Standard deviation between time steps: 0.0400912 +Time elapsed in the skipped time steps: 26.41 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.538321 | 39.9 | 3 -Matrix assembly for implicit scheme | 0.1717073 | 12.7 | 1 -Convection operator | 0.2151823 | 15.9 | 4 -Diffusion operator | 0.01442614 | 1.1 | 2 -Divergence operator | 0.03195598 | 2.4 | 4 -Source terms | 0.0005473772 | 0.0 | 2 -Update ::mettre_a_jour | 0.01190256 | 0.9 | 4 -Computation of the time step dt | 0.001549987 | 0.1 | 4 -Post-treatment operations | 0.02270432 | 1.7 | 1 -Other operations | 0.3423522 | 25.3 | +Linear solver resolutions Ax=B | 0.498521 | 42.7 | 3 +Matrix assembly for implicit scheme | 0.1270503 | 10.9 | 1 +Convection operator | 0.1485969 | 12.7 | 4 +Diffusion operator | 0.01211031 | 1.0 | 2 +Divergence operator | 0.021462 | 1.8 | 4 +Source terms | 0.0005480331 | 0.0 | 2 +Update ::mettre_a_jour | 0.009032102 | 0.8 | 4 +Computation of the time step dt | 0.001589899 | 0.1 | 4 +Post-treatment operations | 0.02160866 | 1.9 | 1 +Other operations | 0.3266274 | 28.0 | Average number of iteration of the linear solver per call: 23.4 @@ -61,17 +61,17 
@@ Average number of iteration of the linear solver per call: 23.4 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.515509 | 38.2 | 3 | -Kernels: | 0.823336 | 61.0 | 435 | -Copy host to device: | 0.00069958 | 0.1 | 21 | 6.4 GB/s -Copy device to host: | 0.000783069 | 0.1 | 7 | 14.2 GB/s -Alloc/Free on device: | 0.00013029 | 0.0 | 4 | -GPU: 99% Copy H<->D: 0.11% Alloc/free: 0.0096% Comm: 0% CPU & I/O: 0.75% +Libraries: | 0.476198 | 40.8 | 3 | +Kernels: | 0.678666 | 58.1 | 434 | +Copy host to device: | 0.000692706 | 0.1 | 21 | 6.5 GB/s +Copy device to host: | 0.00079482 | 0.1 | 7 | 14.0 GB/s +Alloc/Free on device: | 0.000150397 | 0.0 | 4 | +GPU: 99% Copy H<->D: 0.13% Alloc/free: 0.013% Comm: 0% CPU & I/O: 0.91% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.198921 +Time of the post-resolution: 0.202604 -Total time for the whole computation 81.2209 +Total time for the whole computation 82.0843 -[Slurm] Power consumption (89 s): 0.513 kW 0.013 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (88 s): 0.476 kW 0.012 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx942 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx942 index 7a2a08434d..0830e1ccbe 100644 --- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx942 +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx942 @@ -8,8 +8,8 @@ Time is given in seconds 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 17-03-2026 -- 17:55:25 -OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 14:44:39 +OS: a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD Instinct MI300A Accelerator Total number of threads:192 GPU model: AMD Instinct MI300A @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 85.6592 +Total time of the start-up: 63.6504 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 3.56771 +Average time of the resolution of the linear problem per call: 3.86101 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 5.31926 +Total time of the time loop: 5.12227 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.591029 -Standard deviation between time steps: 0.0943462 -Time elapsed in the skipped time steps: 26.3575 +Average time per time step: 0.569141 +Standard deviation between 
time steps: 0.0908845 +Time elapsed in the skipped time steps: 28.5199 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.408167 | 69.1 | 3 -Matrix assembly for implicit scheme | 0.03734021 | 6.3 | 1 -Convection operator | 0.04406685 | 7.5 | 4 -Diffusion operator | 0.004099765 | 0.7 | 2 -Divergence operator | 0.008980409 | 1.5 | 4 -Source terms | 0.0002865144 | 0.0 | 2 -Update ::mettre_a_jour | 0.006264622 | 1.1 | 4 -Computation of the time step dt | 0.001301963 | 0.2 | 4 -Post-treatment operations | 0.01142354 | 1.9 | 1 -Other operations | 0.06909795 | 11.7 | +Linear solver resolutions Ax=B | 0.399856 | 70.3 | 3 +Matrix assembly for implicit scheme | 0.03381677 | 5.9 | 1 +Convection operator | 0.04030673 | 7.1 | 4 +Diffusion operator | 0.004144521 | 0.7 | 2 +Divergence operator | 0.006502531 | 1.1 | 4 +Source terms | 0.0003088568 | 0.1 | 2 +Update ::mettre_a_jour | 0.005641864 | 1.0 | 4 +Computation of the time step dt | 0.001270153 | 0.2 | 4 +Post-treatment operations | 0.0110431 | 1.9 | 1 +Other operations | 0.06625069 | 11.6 | Average number of iteration of the linear solver per call: 23.4 @@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call: 23.4 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.398663 | 67.5 | 3 | -Kernels: | 0.179445 | 30.4 | 434 | -Copy host to device: | 0.000668456 | 0.1 | 21 | 6.7 GB/s -Copy device to host: | 0.00053813 | 0.1 | 7 | 20.7 GB/s -Alloc/Free on device: | 0.000937058 | 0.2 | 4 | -GPU: 98% Copy H<->D: 0.2% Alloc/free: 0.16% Comm: 0% CPU & I/O: 1.8% +Libraries: | 0.390398 | 68.6 | 3 | +Kernels: | 0.165589 | 
29.1 | 434 | +Copy host to device: | 0.000691497 | 0.1 | 21 | 6.5 GB/s +Copy device to host: | 0.000544154 | 0.1 | 7 | 20.5 GB/s +Alloc/Free on device: | 0.000897341 | 0.2 | 4 | +GPU: 98% Copy H<->D: 0.22% Alloc/free: 0.16% Comm: 0% CPU & I/O: 1.9% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.190028 +Time of the post-resolution: 0.182175 -Total time for the whole computation 117.526 +Total time for the whole computation 97.4747 -[Slurm] Power consumption (127 s): 0.693 kW 0.024 kWh 0.002 € (0.10€/kWh) +[Slurm] Power consumption (107 s): 0.680 kW 0.020 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.dalianvl_cc100 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..1aef35a8ee --- /dev/null +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the Implicit_ef_stab_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:16:39 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 40.0777 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 2.69552 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.52199 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.16911 +Standard deviation between time steps: 0.0156024 +Time elapsed in the skipped time steps: 14.9398 + + +Standard counter description | 
Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.108892 | 64.4 | 3 +Matrix assembly for implicit scheme | 0.01232241 | 7.3 | 1 +Convection operator | 0.0112719 | 6.7 | 4 +Diffusion operator | 0.001829879 | 1.1 | 2 +Divergence operator | 0.00168482 | 1.0 | 4 +Source terms | 0.0001482987 | 0.1 | 2 +Update ::mettre_a_jour | 0.002643297 | 1.6 | 4 +Computation of the time step dt | 0.000565564 | 0.3 | 4 +Post-treatment operations | 0.005844443 | 3.5 | 1 +Other operations | 0.02390721 | 14.1 | + +Average number of iteration of the linear solver per call: 23.4 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.101757 | 60.2 | 3 | +Kernels: | 0.0561644 | 33.2 | 434 | +Copy host to device: | 0.000435701 | 0.3 | 21 | 10.3 GB/s +Copy device to host: | 0.000350478 | 0.2 | 7 | 31.8 GB/s +Alloc/Free on device: | 0.0019882 | 1.2 | 4 | +GPU: 93% Copy H<->D: 0.46% Alloc/free: 1.2% Comm: 0% CPU & I/O: 5% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.151419 + +Total time for the whole computation 56.691 + diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.eureka_cc89 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..cfcb620dd7 --- /dev/null +++ 
b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.eureka_cc89 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the Implicit_ef_stab_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:33:22 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 51.361 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 3.54282 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time 
loop: 5.0599 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.562211 +Standard deviation between time steps: 0.0438569 +Time elapsed in the skipped time steps: 36.3611 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.338952 | 60.3 | 3 +Matrix assembly for implicit scheme | 0.03494757 | 6.2 | 1 +Convection operator | 0.03678962 | 6.5 | 4 +Diffusion operator | 0.003927381 | 0.7 | 2 +Divergence operator | 0.004817764 | 0.9 | 4 +Source terms | 0.0006496889 | 0.1 | 2 +Update ::mettre_a_jour | 0.004099632 | 0.7 | 4 +Computation of the time step dt | 0.001328052 | 0.2 | 4 +Post-treatment operations | 0.007013156 | 1.2 | 1 +Other operations | 0.1296865 | 23.1 | + +Average number of iteration of the linear solver per call: 23.4 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.306028 | 54.4 | 3 | +Kernels: | 0.247187 | 44.0 | 434 | +Copy host to device: | 0.00103896 | 0.2 | 21 | 4.3 GB/s +Copy device to host: | 0.00102148 | 0.2 | 7 | 10.9 GB/s +Alloc/Free on device: | 0.000524655 | 0.1 | 4 | +GPU: 98% Copy H<->D: 0.37% Alloc/free: 0.093% Comm: 0% CPU & I/O: 1.1% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0875391 + +Total time for the whole computation 
92.8696 + diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is157091_cc86 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is157091_cc86 index fe44c55875..5a041cf020 100644 --- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is157091_cc86 +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is157091_cc86 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 17:56:57 +Date: 08-06-2026 -- 11:18:18 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 GPU model: NVIDIA RTX A6000 CUDA runtime version: 12.90 -CUDA drivers version: 12.70 +CUDA drivers version: 13.20 Nb procs used for the computation: 1 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 2560000 @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 43.4891 +Total time of the start-up: 27.814 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 2.69107 +Average time of the resolution of the linear problem per call: 2.49741 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 9.01733 +Total time of the time loop: 7.71198 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.00193 -Standard deviation between time steps: 0.056026 -Time elapsed in the skipped time steps: 21.6859 +Average time per time step: 0.856887 +Standard deviation between time steps: 0.0537875 +Time elapsed in the skipped time steps: 26.9383 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.461884 | 46.1 | 3 -Matrix assembly for implicit scheme | 0.09949726 | 9.9 | 1 -Convection operator | 0.09331728 | 9.3 | 4 -Diffusion operator | 0.007343724 | 0.7 | 2 -Divergence operator | 0.02650007 | 2.6 | 4 -Source terms | 0.001259325 | 0.1 | 2 -Update ::mettre_a_jour | 0.01118447 | 1.1 | 4 -Computation of the time step dt | 0.002266761 | 0.2 | 4 -Post-treatment operations | 0.01669265 | 1.7 | 1 -Other operations | 0.2819804 | 28.1 | +Linear solver resolutions Ax=B | 0.444648 | 51.9 | 3 +Matrix assembly for implicit scheme | 0.07059693 | 8.2 | 1 +Convection operator | 0.07447829 | 8.7 | 4 +Diffusion operator | 0.005784988 | 0.7 | 2 +Divergence operator | 0.01199142 | 1.4 | 4 +Source terms | 0.0007986473 | 0.1 | 2 +Update ::mettre_a_jour | 0.007751844 | 0.9 | 4 +Computation of the time step dt | 0.002273929 | 0.3 | 4 +Post-treatment operations | 0.01234477 | 1.4 | 1 +Other operations | 0.2262184 | 26.4 | Average number of iteration of the linear solver per call: 23.4 @@ -61,16 +61,16 @@ Average number of iteration of the linear solver per call: 23.4 ----------------------------------------------------------------------------------------------------------- Counter description | Time 
per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.42272 | 42.2 | 3 | -Kernels: | 0.568741 | 56.8 | 435 | -Copy host to device: | 0.00135453 | 0.1 | 21 | 3.3 GB/s -Copy device to host: | 0.00110431 | 0.1 | 7 | 10.1 GB/s -Alloc/Free on device: | 0.000495608 | 0.0 | 4 | -GPU: 99% Copy H<->D: 0.25% Alloc/free: 0.049% Comm: 0% CPU & I/O: 0.75% +Libraries: | 0.403468 | 47.1 | 3 | +Kernels: | 0.442613 | 51.7 | 434 | +Copy host to device: | 0.00135345 | 0.2 | 21 | 3.3 GB/s +Copy device to host: | 0.00111343 | 0.1 | 7 | 10.0 GB/s +Alloc/Free on device: | 0.000585798 | 0.1 | 4 | +GPU: 99% Copy H<->D: 0.29% Alloc/free: 0.068% Comm: 0% CPU & I/O: 0.9% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0718186 +Time of the post-resolution: 0.0743327 -Total time for the whole computation 74.2642 +Total time for the whole computation 62.5386 diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is159479_cc120 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..07c59a74a3 --- /dev/null +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is159479_cc120 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the Implicit_ef_stab_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:35:42 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 33.7225 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 1.97342 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.81034 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.31226 +Standard deviation between time steps: 0.0220986 +Time elapsed in the 
skipped time steps: 20.2445 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.189262 | 60.6 | 3 +Matrix assembly for implicit scheme | 0.02137981 | 6.8 | 1 +Convection operator | 0.02542805 | 8.1 | 4 +Diffusion operator | 0.002372511 | 0.8 | 2 +Divergence operator | 0.002581936 | 0.8 | 4 +Source terms | 0.000339013 | 0.1 | 2 +Update ::mettre_a_jour | 0.002378845 | 0.8 | 4 +Computation of the time step dt | 0.0007381257 | 0.2 | 4 +Post-treatment operations | 0.005218892 | 1.7 | 1 +Other operations | 0.06256046 | 20.0 | + +Average number of iteration of the linear solver per call: 23.4 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.171479 | 54.9 | 3 | +Kernels: | 0.134278 | 43.0 | 434 | +Copy host to device: | 0.000496272 | 0.2 | 21 | 9.0 GB/s +Copy device to host: | 0.00135792 | 0.4 | 7 | 8.2 GB/s +Alloc/Free on device: | 0.000442294 | 0.1 | 4 | +GPU: 98% Copy H<->D: 0.59% Alloc/free: 0.14% Comm: 0% CPU & I/O: 1.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0666653 + +Total time for the whole computation 56.844 + diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is247793_gfx1100 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is247793_gfx1100 new file 
mode 100644 index 0000000000..0ca61660c2 --- /dev/null +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is247793_gfx1100 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the Implicit_ef_stab_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 17:39:24 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 39.0243 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 4.05154 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the 
computation of the time loop statistics +Total time of the time loop: 14.9036 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 1.65596 +Standard deviation between time steps: 0.558843 +Time elapsed in the skipped time steps: 27.9679 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 1.27397 | 76.9 | 3 +Matrix assembly for implicit scheme | 0.09543697 | 5.8 | 1 +Convection operator | 0.1076007 | 6.5 | 4 +Diffusion operator | 0.007823668 | 0.5 | 2 +Divergence operator | 0.01254682 | 0.8 | 4 +Source terms | 0.0009698686 | 0.1 | 2 +Update ::mettre_a_jour | 0.007694129 | 0.5 | 4 +Computation of the time step dt | 0.002277375 | 0.1 | 4 +Post-treatment operations | 0.01607319 | 1.0 | 1 +Other operations | 0.1315699 | 7.9 | + +Average number of iteration of the linear solver per call: 21.9 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 1.22954 | 74.2 | 3 | +Kernels: | 0.41592 | 25.1 | 434 | +Copy host to device: | 0.000712979 | 0.0 | 21 | 6.3 GB/s +Copy device to host: | 0.000712966 | 0.0 | 7 | 15.6 GB/s +Alloc/Free on device: | 0.000785282 | 0.0 | 4 | +GPU: 99% Copy H<->D: 0.086% Alloc/free: 0.047% Comm: 0% CPU & I/O: 0.5% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the 
post-resolution: 0.0998975 + +Total time for the whole computation 81.9958 + diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.jean-zay_cc90 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.jean-zay_cc90 index 14f0261cec..f113bd06f4 100644 --- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.jean-zay_cc90 +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.jean-zay_cc90 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 21-02-2026 -- 07:35:00 -OS: jzxh025__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +Date: 10-06-2026 -- 10:38:22 +OS: jzxh361__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026 CPU model : Intel(R) Xeon(R) Platinum 8468 Total number of threads:192 GPU model: NVIDIA H100 80GB HBM3 CUDA runtime version: 12.60 -CUDA drivers version: 13.0 +CUDA drivers version: 13.20 Nb procs used for the computation: 1 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 2560000 @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 41.2371 +Total time of the start-up: 32.7393 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 2.40859 +Average time of the resolution of the linear problem per call: 2.67772 Average number of iteration of the linear solver per call: 0 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 2.85861 +Total time of the time loop: 2.6547 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.317624 -Standard deviation between time steps: 0.0171556 -Time elapsed in the skipped time steps: 19.6314 +Average time per time step: 0.294966 +Standard deviation between time steps: 0.0274564 +Time elapsed in the skipped time steps: 30.4392 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.163966 | 51.6 | 3 -Matrix assembly for implicit scheme | 0.02640968 | 8.3 | 1 -Convection operator | 0.02437444 | 7.7 | 4 -Diffusion operator | 0.003660248 | 1.2 | 2 -Divergence operator | 0.006555501 | 2.1 | 4 -Source terms | 0.0004897059 | 0.2 | 2 -Update ::mettre_a_jour | 0.004763247 | 1.5 | 4 -Computation of the time step dt | 0.0008664688 | 0.3 | 4 -Post-treatment operations | 0.00878672 | 2.8 | 1 -Other operations | 0.07775182 | 24.5 | +Linear solver resolutions Ax=B | 0.159624 | 54.1 | 3 +Matrix assembly for implicit scheme | 0.01918769 | 6.5 | 1 +Convection operator | 0.01836007 | 6.2 | 4 +Diffusion operator | 0.002982189 | 1.0 | 2 +Divergence operator | 0.00251469 | 0.9 | 4 +Source terms | 0.0002447141 | 0.1 | 2 +Update ::mettre_a_jour | 0.003794273 | 1.3 | 4 +Computation of the time step dt | 0.0007570568 | 0.3 | 4 +Post-treatment operations | 0.014242 | 4.8 | 1 +Other operations | 0.07325931 | 24.8 | Average number of iteration of the linear solver per call: 23.4 @@ -61,17 +61,17 @@ Average number of iteration of the linear solver per 
call: 23.4 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.152809 | 48.1 | 3 | -Kernels: | 0.15188 | 47.8 | 435 | -Copy host to device: | 0.00076906 | 0.2 | 21 | 5.8 GB/s -Copy device to host: | 0.00166772 | 0.5 | 7 | 6.7 GB/s -Alloc/Free on device: | 0.000731562 | 0.2 | 4 | -GPU: 96% Copy H<->D: 0.77% Alloc/free: 0.23% Comm: 0% CPU & I/O: 3.1% +Libraries: | 0.148476 | 50.3 | 3 | +Kernels: | 0.12706 | 43.1 | 434 | +Copy host to device: | 0.000763417 | 0.3 | 21 | 5.9 GB/s +Copy device to host: | 0.00111141 | 0.4 | 7 | 10.0 GB/s +Alloc/Free on device: | 0.000743817 | 0.3 | 4 | +GPU: 93% Copy H<->D: 0.64% Alloc/free: 0.25% Comm: 0% CPU & I/O: 5.7% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.185702 +Time of the post-resolution: 0.122407 -Total time for the whole computation 63.9128 +Total time for the whole computation 65.9557 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (76 s): 0.451 kW 0.010 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.lumi_gfx90a b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.lumi_gfx90a index 4e9fe93c02..cdeb8a3e64 100644 --- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.lumi_gfx90a +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.lumi_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 24-02-2026 -- 00:02:23 -OS: nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 15-05-2026 -- 20:25:26 +OS: nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 90.1836 +Total time of the start-up: 125.203 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 6.28351 +Average time of the resolution of the linear problem per call: 7.04209 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 12.2985 +Total time of the time loop: 10.6218 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.3665 -Standard deviation between time steps: 0.0415515 -Time elapsed in the skipped time steps: 30.9744 +Average time per time step: 1.1802 +Standard deviation between time steps: 0.0454529 +Time elapsed in the skipped time steps: 40.9321 Standard counter description | Time/step | % loop time | 
Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.536635 | 11.2 | 3 -Matrix assembly for implicit scheme | 0.171109 | 3.6 | 1 -Convection operator | 0.2075454 | 4.3 | 4 -Diffusion operator | 0.01382455 | 0.3 | 2 -Divergence operator | 0.03201726 | 0.7 | 4 -Source terms | 0.0006596 | 0.0 | 2 -Update ::mettre_a_jour | 0.01181509 | 0.2 | 4 -Computation of the time step dt | 0.00181459 | 0.0 | 4 -Post-treatment operations | 0.02227527 | 0.5 | 1 -Other operations | 0.3688072 | 7.7 | +Linear solver resolutions Ax=B | 0.499157 | 42.3 | 3 +Matrix assembly for implicit scheme | 0.1353561 | 11.5 | 1 +Convection operator | 0.1723566 | 14.6 | 4 +Diffusion operator | 0.01120409 | 0.9 | 2 +Divergence operator | 0.02008284 | 1.7 | 4 +Source terms | 0.002546747 | 0.2 | 2 +Update ::mettre_a_jour | 0.008625667 | 0.7 | 4 +Computation of the time step dt | 0.001518505 | 0.1 | 4 +Post-treatment operations | 0.01973878 | 1.7 | 1 +Other operations | 0.3096102 | 26.2 | Average number of iteration of the linear solver per call: 23.4 @@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call: 23.4 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.513683 | 37.6 | 3 | -Kernels: | 0.841248 | 61.6 | 435 | -Copy host to device: | 0.000779138 | 0.1 | 21 | 5.8 GB/s -Copy device to host: | 0.00079515 | 0.1 | 7 | 14.0 GB/s -Alloc/Free on device: | 0.000129299 | 0.0 | 4 | -GPU: 99% Copy H<->D: 0.12% Alloc/free: 0.0095% Comm: 0% CPU & I/O: 0.72% +Libraries: | 0.47665 | 40.4 | 3 | +Kernels: | 0.692056 | 58.6 | 434 | +Copy host to device: | 0.000810867 | 0.1 | 21 | 5.5 GB/s +Copy device to host: | 0.000802662 | 0.1 | 7 | 13.9 GB/s +Alloc/Free 
on device: | 0.000131847 | 0.0 | 4 | +GPU: 99% Copy H<->D: 0.14% Alloc/free: 0.011% Comm: 0% CPU & I/O: 0.83% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.176294 +Time of the post-resolution: 0.184331 -Total time for the whole computation 133.633 +Total time for the whole computation 176.941 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (210 s): 0.501 kW 0.029 kWh 0.003 € (0.10€/kWh) diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.topaze_cc80 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.topaze_cc80 index 06000653dc..a6de5aed35 100644 --- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.topaze_cc80 +++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.topaze_cc80 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 17:46:19 -OS: topaze7059__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 15-05-2026 -- 13:34:57 +OS: topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 58.6111 +Total time of the start-up: 58.2478 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 4.21863 +Average time of the resolution of the linear problem per call: 4.08683 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 4.31741 +Total time of the time loop: 3.80344 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.479712 -Standard deviation between time steps: 0.0290592 -Time elapsed in the skipped time steps: 31.3083 +Average time per time step: 0.422605 +Standard deviation between time steps: 0.0280516 +Time elapsed in the skipped time steps: 39.0974 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.252873 | 6.4 | 3 -Matrix assembly for implicit scheme | 0.04422392 | 1.1 | 1 -Convection operator | 0.04284369 | 1.1 | 4 -Diffusion operator | 0.005367558 | 0.1 | 2 -Divergence operator | 0.008792561 | 0.2 | 4 -Source terms | 0.00110497 | 0.0 | 2 -Update ::mettre_a_jour | 0.005035711 | 0.1 | 4 -Computation of the time step dt | 0.001184417 | 0.0 | 4 -Post-treatment operations | 0.0116908 | 0.3 | 1 -Other operations | 0.1065955 | 2.7 | +Linear solver resolutions Ax=B | 0.242674 | 57.4 | 3 +Matrix assembly for implicit scheme | 0.02970123 | 7.0 | 1 +Convection operator | 0.02794444 | 6.6 | 4 +Diffusion 
operator | 0.004167412 | 1.0 | 2 +Divergence operator | 0.003920845 | 0.9 | 4 +Source terms | 0.0003530411 | 0.1 | 2 +Update ::mettre_a_jour | 0.003896615 | 0.9 | 4 +Computation of the time step dt | 0.0009981211 | 0.2 | 4 +Post-treatment operations | 0.009928146 | 2.3 | 1 +Other operations | 0.09902102 | 23.4 | Average number of iteration of the linear solver per call: 23.4 @@ -61,16 +61,17 @@ Average number of iteration of the linear solver per call: 23.4 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.235152 | 49.0 | 3 | -Kernels: | 0.230355 | 48.0 | 435 | -Copy host to device: | 0.00154799 | 0.3 | 21 | 2.9 GB/s -Copy device to host: | 0.00143133 | 0.3 | 7 | 7.8 GB/s -Alloc/Free on device: | 0.000915653 | 0.2 | 4 | -GPU: 97% Copy H<->D: 0.62% Alloc/free: 0.19% Comm: 0% CPU & I/O: 2.1% +Libraries: | 0.224982 | 53.2 | 3 | +Kernels: | 0.184112 | 43.6 | 434 | +Copy host to device: | 0.00172855 | 0.4 | 21 | 2.6 GB/s +Copy device to host: | 0.000943387 | 0.2 | 7 | 11.8 GB/s +Alloc/Free on device: | 0.000872976 | 0.2 | 4 | +GPU: 97% Copy H<->D: 0.63% Alloc/free: 0.21% Comm: 0% CPU & I/O: 2.4% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.224487 +Time of the post-resolution: 0.211573 -Total time for the whole computation 94.4613 +Total time for the whole computation 101.36 +[Slurm] Power consumption (135 s): 0.443 kW 0.017 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/JEL_bous.data b/tests/GPU/JEL_bous/JEL_bous.data index 9a74e93fb3..4e05f4c9b9 100644 --- 
a/tests/GPU/JEL_bous/JEL_bous.data +++ b/tests/GPU/JEL_bous/JEL_bous.data @@ -39,7 +39,9 @@ END PARTITION # Scatter dom.Zones dom END SCATTER # -vef dis +vef dis +Lire dis { reorder { algo hilbert } } + Runge_Kutta_Rationnel_ordre_2 sch Lire sch { diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx90a b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx90a index 45216fa313..a4e38d7c08 100644 --- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx90a +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 18-03-2026 -- 19:54:45 -OS: g1085__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 09:25:45 +OS: g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 41.1329 +Total time of the start-up: 43.3078 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.21027 +Average time of the resolution of the linear problem per call: 1.50034 Average number of iteration of the linear solver per call: 19.5 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 5.62354 +Total time of the time loop: 4.2439 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.624837 -Standard deviation between time steps: 0.0134143 -Time elapsed in the skipped time steps: 1.0322 +Average time per time step: 0.471545 +Standard deviation between time steps: 0.0104923 +Time elapsed in the skipped time steps: 0.864911 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.12578 | 20.1 | 2 -Convection operator | 0.05487504 | 8.8 | 4 -Diffusion operator | 0.1397324 | 22.4 | 26 -Gradient operator | 0.09053912 | 14.5 | 4 -Divergence operator | 0.02166166 | 3.5 | 3 -Source terms | 0.09107066 | 14.6 | 2 -Update ::mettre_a_jour | 0.01492529 | 2.4 | 1 -Solver for implicit diffusion | 0.03447241 | 5.5 | 4 -Computation of the time step dt | 0.03681047 | 5.9 | 8 -Turbulence model::update | 0.005562819 | 0.9 | 1 -Post-treatment operations | 0.006412253 | 1.0 | 1 -Other operations | 0.002994933 | 0.5 | +Linear solver resolutions Ax=B | 0.112863 | 23.9 | 2 +Convection operator | 0.03530255 | 7.5 | 4 +Diffusion operator | 0.09859322 | 20.9 | 26 +Gradient operator | 0.04428958 | 9.4 | 4 +Divergence operator | 0.01316282 | 2.8 | 3 +Source terms | 0.07620722 | 16.2 | 2 +Update ::mettre_a_jour | 0.01118618 | 2.4 | 1 +Solver for implicit diffusion | 0.03458315 | 7.3 | 4 +Computation of the time step dt | 0.02691001 | 5.7 | 8 +Turbulence model::update | 0.005000574 | 1.1 | 1 +Post-treatment operations | 0.006459059 | 1.4 | 1 +Other 
operations | 0.006986863 | 1.5 | -Average number of iteration of the linear solver per call: 33 +Average number of iteration of the linear solver per call: 36 ----------------------------------------------------------------------------------------------------------- @@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call: 33 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.125386 | 20.1 | 2 | -Kernels: | 0.491607 | 78.7 | 912 | -Copy host to device: | 0.000504216 | 0.1 | 18 | 7.1 GB/s -Copy device to host: | 0.000747571 | 0.1 | 7 | 14.1 GB/s -Alloc/Free on device: | 2.81546e-05 | 0.0 | 0 | -GPU: 99% Copy H<->D: 0.2% Alloc/free: 0.0045% Comm: 0% CPU & I/O: 1.1% +Libraries: | 0.112453 | 23.8 | 2 | +Kernels: | 0.35097 | 74.4 | 910 | +Copy host to device: | 0.000499709 | 0.1 | 18 | 7.1 GB/s +Copy device to host: | 0.000743318 | 0.2 | 7 | 14.2 GB/s +Alloc/Free on device: | 2.73587e-05 | 0.0 | 0 | +GPU: 98% Copy H<->D: 0.26% Alloc/free: 0.0058% Comm: 0% CPU & I/O: 1.5% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.227312 +Time of the post-resolution: 0.184669 -Total time for the whole computation 48.016 +Total time for the whole computation 48.6013 -[Slurm] Power consumption (55 s): 0.490 kW 0.007 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (57 s): 0.460 kW 0.007 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx942 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx942 index 3f909c97a7..ecb0139234 100644 
--- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx942 +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx942 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 17-03-2026 -- 19:11:30 -OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 14:46:22 +OS: a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD Instinct MI300A Accelerator Total number of threads:192 GPU model: AMD Instinct MI300A @@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 52.8219 +Total time of the start-up: 50.9967 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.66151 +Average time of the resolution of the linear problem per call: 1.80608 Average number of iteration of the linear solver per call: 19.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.7971 +Total time of the time loop: 1.7142 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.199678 -Standard 
deviation between time steps: 0.00636364 -Time elapsed in the skipped time steps: 0.793102 +Average time per time step: 0.190467 +Standard deviation between time steps: 0.00651408 +Time elapsed in the skipped time steps: 0.781128 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0579626 | 29.0 | 2 -Convection operator | 0.01474113 | 7.4 | 4 -Diffusion operator | 0.04030403 | 20.2 | 26 -Gradient operator | 0.01876236 | 9.4 | 4 -Divergence operator | 0.005992479 | 3.0 | 3 -Source terms | 0.01091527 | 5.5 | 2 -Update ::mettre_a_jour | 0.00445164 | 2.2 | 1 -Solver for implicit diffusion | 0.02312014 | 11.6 | 4 -Computation of the time step dt | 0.01067176 | 5.3 | 8 -Turbulence model::update | 0.00141954 | 0.7 | 1 -Post-treatment operations | 0.005924567 | 3.0 | 1 -Other operations | 0.005412221 | 2.7 | +Linear solver resolutions Ax=B | 0.0581491 | 30.5 | 2 +Convection operator | 0.01218876 | 6.4 | 4 +Diffusion operator | 0.03752443 | 19.7 | 26 +Gradient operator | 0.01810211 | 9.5 | 4 +Divergence operator | 0.004492509 | 2.4 | 3 +Source terms | 0.01069241 | 5.6 | 2 +Update ::mettre_a_jour | 0.003962929 | 2.1 | 1 +Solver for implicit diffusion | 0.02360572 | 12.4 | 4 +Computation of the time step dt | 0.008076957 | 4.2 | 8 +Turbulence model::update | 0.001345484 | 0.7 | 1 +Post-treatment operations | 0.006274298 | 3.3 | 1 +Other operations | 0.006052508 | 3.2 | -Average number of iteration of the linear solver per call: 33 +Average number of iteration of the linear solver per call: 36 ----------------------------------------------------------------------------------------------------------- @@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call: 33 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time 
| Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0577297 | 28.9 | 2 | -Kernels: | 0.134693 | 67.5 | 912 | -Copy host to device: | 0.00041607 | 0.2 | 18 | 8.6 GB/s -Copy device to host: | 0.000501596 | 0.3 | 7 | 21.0 GB/s -Alloc/Free on device: | 0.000339519 | 0.2 | 0 | -GPU: 96% Copy H<->D: 0.46% Alloc/free: 0.17% Comm: 0% CPU & I/O: 3% +Libraries: | 0.0579074 | 30.4 | 2 | +Kernels: | 0.12491 | 65.6 | 910 | +Copy host to device: | 0.000426176 | 0.2 | 18 | 8.4 GB/s +Copy device to host: | 0.000547077 | 0.3 | 7 | 19.3 GB/s +Alloc/Free on device: | 0.000345408 | 0.2 | 0 | +GPU: 96% Copy H<->D: 0.51% Alloc/free: 0.18% Comm: 0% CPU & I/O: 3.3% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.170727 +Time of the post-resolution: 0.160747 -Total time for the whole computation 55.5828 +Total time for the whole computation 53.6528 -[Slurm] Power consumption (76 s): 0.557 kW 0.012 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (63 s): 0.647 kW 0.011 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.dalianvl_cc100 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..a4325074e6 --- /dev/null +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the JEL_bous_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:17:26 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2332800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 27.6976 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.22819 +Average number of iteration of the linear solver per call: 19.5 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.08387 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.12043 +Standard deviation between time steps: 0.0031966 +Time elapsed in the skipped time steps: 0.371688 + + +Standard counter description | 
Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0523012 | 43.4 | 2 +Convection operator | 0.008088967 | 6.7 | 4 +Diffusion operator | 0.02270846 | 18.9 | 26 +Gradient operator | 0.004848305 | 4.0 | 4 +Divergence operator | 0.001220368 | 1.0 | 3 +Source terms | 0.003061725 | 2.5 | 2 +Update ::mettre_a_jour | 0.00164626 | 1.4 | 1 +Solver for implicit diffusion | 0.01236225 | 10.3 | 4 +Computation of the time step dt | 0.002862308 | 2.4 | 8 +Turbulence model::update | 0.0006005719 | 0.5 | 1 +Post-treatment operations | 0.005131665 | 4.3 | 1 +Other operations | 0.005598225 | 4.6 | + +Average number of iteration of the linear solver per call: 33 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0520444 | 43.2 | 2 | +Kernels: | 0.0606194 | 50.3 | 910 | +Copy host to device: | 0.000342986 | 0.3 | 18 | 10.4 GB/s +Copy device to host: | 0.000255232 | 0.2 | 7 | 41.3 GB/s +Alloc/Free on device: | 0.000147381 | 0.1 | 0 | +GPU: 94% Copy H<->D: 0.5% Alloc/free: 0.12% Comm: 0% CPU & I/O: 5.8% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.130801 + +Total time for the whole computation 29.284 + diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.eureka_cc89 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.eureka_cc89 new file mode 100644 
index 0000000000..d0c1e60cfb --- /dev/null +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.eureka_cc89 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the JEL_bous_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:34:44 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2332800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 37.701 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.65191 +Average number of iteration of the linear solver per call: 19.5 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop 
statistics +Total time of the time loop: 2.346 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.260667 +Standard deviation between time steps: 0.00717772 +Time elapsed in the skipped time steps: 1.30342 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.100194 | 38.4 | 2 +Convection operator | 0.01751202 | 6.7 | 4 +Diffusion operator | 0.04631485 | 17.8 | 26 +Gradient operator | 0.01122161 | 4.3 | 4 +Divergence operator | 0.003520645 | 1.4 | 3 +Source terms | 0.01072508 | 4.1 | 2 +Update ::mettre_a_jour | 0.004113718 | 1.6 | 1 +Solver for implicit diffusion | 0.03897518 | 15.0 | 4 +Computation of the time step dt | 0.007720912 | 3.0 | 8 +Turbulence model::update | 0.001730033 | 0.7 | 1 +Post-treatment operations | 0.005838174 | 2.2 | 1 +Other operations | 0.01280066 | 4.9 | + +Average number of iteration of the linear solver per call: 33 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0999176 | 38.3 | 2 | +Kernels: | 0.153016 | 58.7 | 910 | +Copy host to device: | 0.000531594 | 0.2 | 18 | 6.7 GB/s +Copy device to host: | 0.00163624 | 0.6 | 7 | 6.4 GB/s +Alloc/Free on device: | 0.000130279 | 0.0 | 0 | +GPU: 97% Copy H<->D: 0.83% Alloc/free: 0.05% Comm: 0% CPU & I/O: 2.1% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0708352 + +Total time for the whole computation 41.4213 + diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.irene-amd-ccrt_cc70 index 1979b6ebb7..e757b8e9f3 100644 --- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.irene-amd-ccrt_cc70 +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.irene-amd-ccrt_cc70 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 14:19:16 -OS: irene7053__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 23-04-2026 -- 14:55:37 +OS: irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz Total number of threads:80 GPU model: Tesla V100-SXM2-16GB @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 52.8484 +Total time of the start-up: 52.0313 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 2.81103 +Average time of the resolution of the linear problem per call: 2.46353 Average number of iteration of the linear solver per call: 19.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 5.40516 +Total time of the time loop: 3.91246 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.600573 -Standard deviation between time steps: 0.016464 -Time elapsed in the skipped time steps: 2.03739 +Average time per time step: 0.434718 +Standard deviation between time steps: 0.0117024 +Time elapsed in the skipped time steps: 1.83216 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.178813 | 29.8 | 2 -Convection operator | 0.04298263 | 7.2 | 4 -Diffusion operator | 0.1637061 | 27.3 | 26 -Gradient operator | 0.03590419 | 6.0 | 4 -Divergence operator | 0.02346987 | 3.9 | 3 -Source terms | 0.03134122 | 5.2 | 2 -Update ::mettre_a_jour | 0.01515883 | 2.5 | 1 -Solver for implicit diffusion | 0.04919646 | 8.2 | 4 -Computation of the time step dt | 0.03779348 | 6.3 | 8 -Turbulence model::update | 0.004684588 | 0.8 | 1 -Post-treatment operations | 0.008450638 | 1.4 | 1 -Other operations | 0.009071418 | 1.5 | +Linear solver resolutions Ax=B | 0.151745 | 34.9 | 2 +Convection operator | 0.02512061 | 5.8 | 4 +Diffusion operator | 0.09764007 | 22.5 | 26 +Gradient operator | 0.01804826 | 4.2 | 4 +Divergence operator | 0.01293974 | 3.0 | 3 +Source terms | 0.02053407 | 4.7 | 2 +Update ::mettre_a_jour | 0.01018209 | 2.3 | 1 +Solver for implicit diffusion | 0.04906223 | 11.3 | 4 +Computation of the time step dt | 0.02539198 | 5.8 | 8 +Turbulence model::update | 0.00363699 | 0.8 | 1 +Post-treatment operations | 0.008557846 | 2.0 | 1 +Other operations | 0.01185921 | 2.7 | Average number of iteration of the linear solver per call: 33 @@ -63,17 +63,17 @@ Average number of iteration of 
the linear solver per call: 33 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.17824 | 29.7 | 2 | -Kernels: | 0.410514 | 68.4 | 912 | -Copy host to device: | 0.00116731 | 0.2 | 18 | 3.1 GB/s -Copy device to host: | 0.00258639 | 0.4 | 7 | 4.1 GB/s -Alloc/Free on device: | 0.000100786 | 0.0 | 0 | -GPU: 98% Copy H<->D: 0.63% Alloc/free: 0.017% Comm: 0% CPU & I/O: 1.3% +Libraries: | 0.15118 | 34.8 | 2 | +Kernels: | 0.271847 | 62.5 | 910 | +Copy host to device: | 0.00114374 | 0.3 | 18 | 3.1 GB/s +Copy device to host: | 0.00269277 | 0.6 | 7 | 3.9 GB/s +Alloc/Free on device: | 0.000103352 | 0.0 | 0 | +GPU: 97% Copy H<->D: 0.88% Alloc/free: 0.024% Comm: 0% CPU & I/O: 1.8% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.168883 +Time of the post-resolution: 0.165987 -Total time for the whole computation 60.4598 +Total time for the whole computation 57.9419 -[Slurm] Power consumption (83 s): 0.208 kW 0.005 kWh 0.000 € (0.10€/kWh) +[Slurm] Power consumption (70 s): 0.168 kW 0.003 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is157091_cc86 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is157091_cc86 index 1748ac8337..b5dfaed898 100644 --- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is157091_cc86 +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is157091_cc86 @@ -8,7 +8,7 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 11-03-2026 -- 20:22:08 +Date: 23-04-2026 -- 11:36:24 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 32.5388 +Total time of the start-up: 33.1677 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.27684 +Average time of the resolution of the linear problem per call: 1.52807 Average number of iteration of the linear solver per call: 19.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 5.07593 +Total time of the time loop: 4.11182 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.563992 -Standard deviation between time steps: 0.0136432 -Time elapsed in the skipped time steps: 1.26174 +Average time per time step: 0.456869 +Standard deviation between time steps: 0.0119697 +Time elapsed in the skipped time steps: 1.21145 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.186567 | 33.1 | 2 
-Convection operator | 0.04115435 | 7.3 | 4 -Diffusion operator | 0.118614 | 21.0 | 26 -Gradient operator | 0.03055508 | 5.4 | 4 -Divergence operator | 0.01801101 | 3.2 | 3 -Source terms | 0.03885931 | 6.9 | 2 -Update ::mettre_a_jour | 0.01322145 | 2.3 | 1 -Solver for implicit diffusion | 0.05424782 | 9.6 | 4 -Computation of the time step dt | 0.0408768 | 7.2 | 8 -Turbulence model::update | 0.005068876 | 0.9 | 1 -Post-treatment operations | 0.005201474 | 0.9 | 1 -Other operations | 0.0116147 | 2.1 | +Linear solver resolutions Ax=B | 0.160973 | 35.2 | 2 +Convection operator | 0.03524869 | 7.7 | 4 +Diffusion operator | 0.08888196 | 19.5 | 26 +Gradient operator | 0.01371358 | 3.0 | 4 +Divergence operator | 0.0114062 | 2.5 | 3 +Source terms | 0.03076204 | 6.7 | 2 +Update ::mettre_a_jour | 0.01001952 | 2.2 | 1 +Solver for implicit diffusion | 0.05382268 | 11.8 | 4 +Computation of the time step dt | 0.02984461 | 6.5 | 8 +Turbulence model::update | 0.004468384 | 1.0 | 1 +Post-treatment operations | 0.005182095 | 1.1 | 1 +Other operations | 0.01254604 | 2.7 | Average number of iteration of the linear solver per call: 33 @@ -63,16 +63,16 @@ Average number of iteration of the linear solver per call: 33 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.185407 | 32.9 | 2 | -Kernels: | 0.371386 | 65.8 | 912 | -Copy host to device: | 0.000484581 | 0.1 | 18 | 7.4 GB/s -Copy device to host: | 0.00107678 | 0.2 | 7 | 9.8 GB/s -Alloc/Free on device: | 0.000149894 | 0.0 | 0 | -GPU: 99% Copy H<->D: 0.28% Alloc/free: 0.027% Comm: 0% CPU & I/O: 0.97% +Libraries: | 0.160438 | 35.1 | 2 | +Kernels: | 0.289686 | 63.4 | 910 | +Copy host to device: | 0.000593965 | 0.1 | 18 | 6.0 GB/s +Copy device to host: | 0.00107771 | 0.2 | 7 | 9.8 GB/s 
+Alloc/Free on device: | 0.000141054 | 0.0 | 0 | +GPU: 99% Copy H<->D: 0.37% Alloc/free: 0.031% Comm: 0% CPU & I/O: 1.1% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.265853 +Time of the post-resolution: 0.0692136 -Total time for the whole computation 39.1423 +Total time for the whole computation 38.5602 diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is159479_cc120 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..ceb6b4f984 --- /dev/null +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is159479_cc120 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the JEL_bous_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:36:31 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2332800 + 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 24.8498 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.909361 +Average number of iteration of the linear solver per call: 19.5 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.46471 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.162746 +Standard deviation between time steps: 0.00530569 +Time elapsed in the skipped time steps: 0.776553 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0690585 | 42.4 | 2 +Convection operator | 0.01185419 | 7.3 | 4 +Diffusion operator | 0.02884948 | 17.7 | 26 +Gradient operator | 0.006323226 | 3.9 | 4 +Divergence operator | 0.001823318 | 1.1 | 3 +Source terms | 0.006613129 | 4.1 | 2 +Update ::mettre_a_jour | 0.002277541 | 1.4 | 1 +Solver for implicit diffusion | 0.0200064 | 12.3 | 4 +Computation of the time step dt | 0.004297829 | 2.6 | 8 +Turbulence model::update | 0.001027548 | 0.6 | 1 +Post-treatment operations | 0.003883966 | 2.4 | 1 +Other operations | 0.006730717 | 4.1 | + +Average number of iteration of the linear solver per call: 33 + + +----------------------------------------------------------------------------------------------------------- + 
GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0682865 | 42.0 | 2 | +Kernels: | 0.0889239 | 54.6 | 910 | +Copy host to device: | 0.000346185 | 0.2 | 18 | 10.3 GB/s +Copy device to host: | 0.00149129 | 0.9 | 7 | 7.1 GB/s +Alloc/Free on device: | 4.52976e-05 | 0.0 | 0 | +GPU: 97% Copy H<->D: 1.1% Alloc/free: 0.028% Comm: 0% CPU & I/O: 2.2% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0580743 + +Total time for the whole computation 27.1491 + diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is247793_gfx1100 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is247793_gfx1100 index 6655875c6c..7713af45da 100644 --- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is247793_gfx1100 +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is247793_gfx1100 @@ -1,53 +1,78 @@ -Statistiques d'initialisation du calcul - -Temps total 45.5252 - -Statistiques de resolution du probleme - -Temps total 8.20278 - - -Timesteps 10 -Secondes / pas de temps 0.820271 -Dont solveurs Ax=B 0.209617 25% (2 appels/pas de temps) -Dont solveur diffusion_implicite 0.087671 10% (4 appels/pas de temps) -Dont mettre_a_jour 0.018403 2% (1 appel/pas de temps) -Dont operateurs convection 0.113021 13% (4 appels/pas de temps) -Dont operateurs diffusion 0.218056 26% (26 appels/pas de temps) -Dont operateurs gradient 0.039941 4% (4 appels/pas de temps) -Dont operateurs divergence 0.012809 1% (3 appels/pas de temps) -Dont operateurs source 0.052145 6% (2 appels/pas de temps) -Dont operations postraitement 0.017725 2% 
(1 appel/pas de temps) -Dont calcul dt 0.022425 2% (8 appels/pas de temps) -Dont modele turbulence 0.006923 0% (1 appel/pas de temps) -Dont calcul divers 0.021534 2% (0 appels/pas de temps) -Nb solveur / pas de temps 2 -Secondes / solveur 0.104808 -Iterations / solveur 27.5 -GPU statistics per time step (experimental): -Libraries : 0.209143 s 25.5% 2.0 calls -Kernels : 0.547938 s 66.8% 895.4 calls -Copy H2D : 0.025278 s 3.1% 59.6 calls 11.2 GB/s -Copy D2H : 0.013749 s 1.7% 80.7 calls 12.7 GB/s -Alloc/Free: 0.001174 s 0.1% 8.0 calls -GPU: 92.2% Copy H<->D: 4.7% Alloc/Free: 0.1% Comm: 0% CPU & Others: 2.8% -I/O: - -Timesteps = number of time steps -Nb solveur = number of linear system resolutions -Nb assemblage implicite = number of matrix assemblies for the implicit scheme -Iterations = average number of iterations of the solver -Communications = fraction of the time spent - in communications between processors (excluding io files) -Network latency = time of one mpsum measured by an internal bench over 0.1s -Network bandwidth = maximum on all processors - of the average bandwidth of send_recv operations -Waiting time = estimation of the waiting time of the different processors - -Max_waiting_time big => probably due to a bad partitioning -Communications > 30% => too many processors or network too slow - -Statistiques de post resolution - -Temps total 0.119974 + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the JEL_bous_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 18:28:55 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2332800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 29.6466 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 2.0431 +Average number of iteration of the linear solver per call: 19.5 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 4.40577 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.48953 +Standard deviation between time steps: 0.011864 +Time elapsed in the skipped time steps: 1.64208 + + +Standard 
counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.190709 | 39.0 | 2 +Convection operator | 0.0311751 | 6.4 | 4 +Diffusion operator | 0.08601413 | 17.6 | 26 +Gradient operator | 0.02478783 | 5.1 | 4 +Divergence operator | 0.007880499 | 1.6 | 3 +Source terms | 0.03365951 | 6.9 | 2 +Update ::mettre_a_jour | 0.007327208 | 1.5 | 1 +Solver for implicit diffusion | 0.06193504 | 12.7 | 4 +Computation of the time step dt | 0.01854743 | 3.8 | 8 +Turbulence model::update | 0.00304363 | 0.6 | 1 +Post-treatment operations | 0.004884117 | 1.0 | 1 +Other operations | 0.01956652 | 4.0 | + +Average number of iteration of the linear solver per call: 36 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.190255 | 38.9 | 2 | +Kernels: | 0.293721 | 60.0 | 910 | +Copy host to device: | 0.000463657 | 0.1 | 18 | 7.7 GB/s +Copy device to host: | 0.00109986 | 0.2 | 7 | 9.6 GB/s +Alloc/Free on device: | 0.000104513 | 0.0 | 0 | +GPU: 99% Copy H<->D: 0.32% Alloc/free: 0.021% Comm: 0% CPU & I/O: 0.79% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.138724 + +Total time for the whole computation 35.8332 diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.jean-zay_cc90 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.jean-zay_cc90 index 
5710fadd8e..8bc7ec847b 100644 --- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.jean-zay_cc90 +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.jean-zay_cc90 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 14:04:54 -OS: jzxh082__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +Date: 23-04-2026 -- 08:17:58 +OS: jzxh011__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 CPU model : Intel(R) Xeon(R) Platinum 8468 Total number of threads:192 GPU model: NVIDIA H100 80GB HBM3 @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 34.1096 +Total time of the start-up: 39.7913 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.50123 +Average time of the resolution of the linear problem per call: 2.18749 Average number of iteration of the linear solver per call: 19.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.58632 +Total time of the time loop: 1.42528 Number of time steps: 9 Skipped time steps: 1 -Average 
time per time step: 0.176258 -Standard deviation between time steps: 0.0118648 -Time elapsed in the skipped time steps: 1.20137 +Average time per time step: 0.158365 +Standard deviation between time steps: 0.00459821 +Time elapsed in the skipped time steps: 1.00394 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.066867 | 37.9 | 2 -Convection operator | 0.01270288 | 7.2 | 4 -Diffusion operator | 0.03147784 | 17.9 | 26 -Gradient operator | 0.009996821 | 5.7 | 4 -Divergence operator | 0.004424433 | 2.5 | 3 -Source terms | 0.006921498 | 3.9 | 2 -Update ::mettre_a_jour | 0.003625511 | 2.1 | 1 -Solver for implicit diffusion | 0.0164828 | 9.4 | 4 -Computation of the time step dt | 0.0054422 | 3.1 | 8 -Turbulence model::update | 0.001022485 | 0.6 | 1 -Post-treatment operations | 0.01127463 | 6.4 | 1 -Other operations | 0.006019646 | 3.4 | +Linear solver resolutions Ax=B | 0.0654879 | 41.4 | 2 +Convection operator | 0.009842782 | 6.2 | 4 +Diffusion operator | 0.02657007 | 16.8 | 26 +Gradient operator | 0.00834769 | 5.3 | 4 +Divergence operator | 0.002102096 | 1.3 | 3 +Source terms | 0.006326014 | 4.0 | 2 +Update ::mettre_a_jour | 0.002668702 | 1.7 | 1 +Solver for implicit diffusion | 0.0166026 | 10.5 | 4 +Computation of the time step dt | 0.004435701 | 2.8 | 8 +Turbulence model::update | 0.0009156802 | 0.6 | 1 +Post-treatment operations | 0.008085213 | 5.1 | 1 +Other operations | 0.00698039 | 4.4 | Average number of iteration of the linear solver per call: 33 @@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call: 33 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- 
-Libraries: | 0.0666176 | 37.8 | 2 | -Kernels: | 0.0952303 | 54.0 | 912 | -Copy host to device: | 0.00059641 | 0.3 | 18 | 6.0 GB/s -Copy device to host: | 0.00112574 | 0.6 | 7 | 9.4 GB/s -Alloc/Free on device: | 6.89258e-05 | 0.0 | 0 | -GPU: 92% Copy H<->D: 0.98% Alloc/free: 0.039% Comm: 0% CPU & I/O: 7.2% +Libraries: | 0.0652517 | 41.2 | 2 | +Kernels: | 0.0818582 | 51.7 | 910 | +Copy host to device: | 0.000631026 | 0.4 | 18 | 5.7 GB/s +Copy device to host: | 0.00116906 | 0.7 | 7 | 9.0 GB/s +Alloc/Free on device: | 6.88022e-05 | 0.0 | 0 | +GPU: 93% Copy H<->D: 1.1% Alloc/free: 0.043% Comm: 0% CPU & I/O: 5.9% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.132989 +Time of the post-resolution: 0.0998874 -Total time for the whole computation 37.0303 +Total time for the whole computation 42.3205 -[Slurm] Power consumption (62 s): 0.367 kW 0.006 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (50 s): 0.425 kW 0.006 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.lumi_gfx90a b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.lumi_gfx90a index 518c051131..038c9a11fd 100644 --- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.lumi_gfx90a +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.lumi_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 24-02-2026 -- 00:05:59 -OS: nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 15-05-2026 -- 20:30:58 +OS: 
nid005005__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 79.3918 +Total time of the start-up: 87.1572 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 4.23093 +Average time of the resolution of the linear problem per call: 3.02811 Average number of iteration of the linear solver per call: 19.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 5.85712 +Total time of the time loop: 4.06447 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.650791 -Standard deviation between time steps: 0.0144864 -Time elapsed in the skipped time steps: 1.53947 +Average time per time step: 0.451608 +Standard deviation between time steps: 0.00999368 +Time elapsed in the skipped time steps: 1.32075 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.124112 | 15.1 | 2 -Convection operator | 0.05576208 | 6.8 | 4 -Diffusion operator | 0.1568919 | 19.1 | 26 -Gradient operator | 0.08943955 | 10.9 | 4 
-Divergence operator | 0.02169871 | 2.6 | 3 -Source terms | 0.09198948 | 11.2 | 2 -Update ::mettre_a_jour | 0.01530994 | 1.9 | 1 -Solver for implicit diffusion | 0.03785859 | 4.6 | 4 -Computation of the time step dt | 0.03651262 | 4.4 | 8 -Turbulence model::update | 0.005892088 | 0.7 | 1 -Post-treatment operations | 0.01152028 | 1.4 | 1 -Other operations | 0.003804343 | 0.5 | +Linear solver resolutions Ax=B | 0.108979 | 24.1 | 2 +Convection operator | 0.03298987 | 7.3 | 4 +Diffusion operator | 0.09218553 | 20.4 | 26 +Gradient operator | 0.03981864 | 8.8 | 4 +Divergence operator | 0.01219288 | 2.7 | 3 +Source terms | 0.07573489 | 16.8 | 2 +Update ::mettre_a_jour | 0.01071812 | 2.4 | 1 +Solver for implicit diffusion | 0.03443697 | 7.6 | 4 +Computation of the time step dt | 0.02651764 | 5.9 | 8 +Turbulence model::update | 0.004947016 | 1.1 | 1 +Post-treatment operations | 0.006235741 | 1.4 | 1 +Other operations | 0.006851491 | 1.5 | -Average number of iteration of the linear solver per call: 30 +Average number of iteration of the linear solver per call: 36 ----------------------------------------------------------------------------------------------------------- @@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call: 30 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.122748 | 18.9 | 2 | -Kernels: | 0.5152 | 79.2 | 912 | -Copy host to device: | 0.000510664 | 0.1 | 18 | 7.0 GB/s -Copy device to host: | 0.000761906 | 0.1 | 7 | 13.8 GB/s -Alloc/Free on device: | 2.60104e-05 | 0.0 | 0 | -GPU: 98% Copy H<->D: 0.2% Alloc/free: 0.004% Comm: 0% CPU & I/O: 1.8% +Libraries: | 0.108587 | 24.0 | 2 | +Kernels: | 0.335198 | 74.2 | 910 | +Copy host to device: | 0.000510021 | 0.1 | 18 | 7.0 GB/s +Copy 
device to host: | 0.000747086 | 0.2 | 7 | 14.1 GB/s +Alloc/Free on device: | 2.6155e-05 | 0.0 | 0 | +GPU: 98% Copy H<->D: 0.28% Alloc/free: 0.0058% Comm: 0% CPU & I/O: 1.4% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.169877 +Time of the post-resolution: 0.170278 -Total time for the whole computation 86.9583 +Total time for the whole computation 92.7127 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (121 s): 0.490 kW 0.016 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.topaze_cc80 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.topaze_cc80 index e197f0213a..379c065968 100644 --- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.topaze_cc80 +++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.topaze_cc80 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 17:49:16 -OS: topaze7050__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 15-05-2026 -- 13:37:16 +OS: topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 41.3779 +Total time of the start-up: 41.8203 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 2.0777 +Average time of the resolution of the linear problem per call: 1.83419 Average number of iteration of the linear solver per call: 19.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 2.51937 +Total time of the time loop: 2.04769 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.27993 -Standard deviation between time steps: 0.00755805 -Time elapsed in the skipped time steps: 1.24275 +Average time per time step: 0.227521 +Standard deviation between time steps: 0.00633132 +Time elapsed in the skipped time steps: 1.15739 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.095653 | 22.9 | 2 -Convection operator | 0.02602373 | 6.2 | 4 -Diffusion operator | 0.06149648 | 14.7 | 26 -Gradient operator | 0.01634973 | 3.9 | 4 -Divergence operator | 0.005980824 | 1.4 | 3 -Source terms | 0.0107638 | 2.6 | 2 -Update ::mettre_a_jour | 0.004935778 | 1.2 | 1 -Solver for implicit diffusion | 0.02931744 | 7.0 | 4 -Computation of the time step dt | 0.01242498 | 3.0 | 8 -Turbulence model::update | 0.001668194 | 0.4 | 1 -Post-treatment operations | 0.006153714 | 1.5 | 1 -Other operations | 0.009161827 | 2.2 | +Linear solver resolutions Ax=B | 0.0931296 | 40.9 | 2 +Convection operator | 
0.01537725 | 6.8 | 4 +Diffusion operator | 0.04146383 | 18.2 | 26 +Gradient operator | 0.01223332 | 5.4 | 4 +Divergence operator | 0.002743943 | 1.2 | 3 +Source terms | 0.009263052 | 4.1 | 2 +Update ::mettre_a_jour | 0.003475224 | 1.5 | 1 +Solver for implicit diffusion | 0.0260186 | 11.4 | 4 +Computation of the time step dt | 0.007452805 | 3.3 | 8 +Turbulence model::update | 0.001451508 | 0.6 | 1 +Post-treatment operations | 0.006422715 | 2.8 | 1 +Other operations | 0.008489126 | 3.7 | Average number of iteration of the linear solver per call: 33 @@ -63,16 +63,17 @@ Average number of iteration of the linear solver per call: 33 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0953374 | 34.1 | 2 | -Kernels: | 0.176263 | 63.0 | 912 | -Copy host to device: | 0.000504537 | 0.2 | 18 | 7.1 GB/s -Copy device to host: | 0.000921495 | 0.3 | 7 | 11.5 GB/s -Alloc/Free on device: | 9.07312e-05 | 0.0 | 0 | -GPU: 97% Copy H<->D: 0.51% Alloc/free: 0.032% Comm: 0% CPU & I/O: 2.4% +Libraries: | 0.0928011 | 40.8 | 2 | +Kernels: | 0.126048 | 55.4 | 910 | +Copy host to device: | 0.000506734 | 0.2 | 18 | 7.1 GB/s +Copy device to host: | 0.000953698 | 0.4 | 7 | 11.1 GB/s +Alloc/Free on device: | 0.00010037 | 0.0 | 0 | +GPU: 96% Copy H<->D: 0.64% Alloc/free: 0.044% Comm: 0% CPU & I/O: 3.1% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.193253 +Time of the post-resolution: 0.184978 -Total time for the whole computation 45.3332 +Total time for the whole computation 45.2103 +[Slurm] Power 
consumption (80 s): 0.400 kW 0.009 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx90ax8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx90ax8 index dbf1d79796..d6ecc5ce4c 100644 --- a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx90ax8 +++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx90ax8 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 18:07:54 -OS: g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 08-06-2026 -- 16:05:31 +OS: g1331__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,86 +22,86 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 11.3269 -Number of virtual exchanges: 88 -Maximum number of MPI allreduce per time step 230 +Total time of the start-up: 17.1944 +Number of virtual exchanges: 91 +Maximum number of MPI allreduce per time step 234 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.544609 +Average time of the resolution of the linear problem per call: 1.73554 Average number of iteration of the linear solver per call: 19.5 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.7114 +Total time of the time loop: 1.3246 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.190155 -Standard deviation between time steps: 0.00412601 -Time elapsed in the skipped time steps: 0.300029 +Average time per time step: 0.147178 +Standard deviation between time steps: 0.0035771 +Time elapsed in the skipped time steps: 0.332575 -Percent of total time spend in communication: 6.97401 +Percent of total time spend in communication: 8.20387 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.097368 | 43.6 | 2 -Convection operator | 0.008809825 | 3.9 | 4 -Diffusion operator | 0.0271289 | 12.1 | 26 -Gradient operator | 0.01032468 | 4.6 | 4 -Divergence operator | 0.003033461 | 1.4 | 3 -Source terms | 0.009662045 | 4.3 | 2 -Update ::mettre_a_jour | 0.003470453 | 1.6 | 1 -Solver for implicit diffusion | 0.01615869 | 7.2 | 4 -Computation of the time step dt | 0.00510907 | 2.3 | 8 -Turbulence model::update | 0.001198047 | 0.5 | 1 -Post-treatment operations | 0.002356216 | 1.1 | 1 -Other operations | 0.005535735 | 2.5 | -Number of virtual exchanges per time step: 80 +Linear solver resolutions Ax=B | 0.0605146 | 41.1 | 2 +Convection operator | 0.005624402 | 3.8 | 4 +Diffusion operator | 0.02587681 | 17.6 | 26 +Gradient operator | 0.006207187 | 4.2 | 4 +Divergence operator | 0.002373149 | 1.6 | 3 +Source terms | 0.009926151 | 6.7 | 2 +Update ::mettre_a_jour | 0.003552744 | 2.4 | 1 +Solver for implicit diffusion | 0.01746022 | 11.9 | 4 
+Computation of the time step dt | 0.004997502 | 3.4 | 8 +Turbulence model::update | 0.00122556 | 0.8 | 1 +Post-treatment operations | 0.002465866 | 1.7 | 1 +Other operations | 0.006953813 | 4.7 | +Number of virtual exchanges per time step: 76 Maximum number of MPI allreduce per time step 66.7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Average number of iteration of the linear solver per call: 33 +Average number of iteration of the linear solver per call: 30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics: IO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Output write sequential: 895 MB/s +Output write sequential: 886 MB/s --------------------------------------------------------------------------------------------------------- Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. 
--------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 4.7 % -Max of the fraction of the time spent in communications between processors: 8.4 % -Min of the fraction of the time spent in communications between processors: 4 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 3.39618e-06 -Network maximum bandwidth on all processors: 47.1 GB/s +Average of the fraction of the time spent in communications between processors: 6.5 % +Max of the fraction of the time spent in communications between processors: 11.6 % +Min of the fraction of the time spent in communications between processors: 5.3 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 3.28297e-06 +Network maximum bandwidth on all processors: 46.3 GB/s Total network traffic: 1136.61 MB/time step Average message size: 473.762 kB -Min waiting time: 4.2 % of total time -Max waiting time: 8 % of total time -Avg waiting time: 5.4375 % of total time +Min waiting time: 5.2 % of total time +Max waiting time: 10.1 % of total time +Avg waiting time: 8.075 % of total time ----------------------------------------------------------------------------------------------------------- GPU statistics ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.096971 | 51.0 | 2 | -Kernels: | 0.0718802 | 37.8 | 1084 | -Copy host to device: | 0.000331019 | 0.2 | 12 | 3.9 GB/s -Copy device to host: | 0.000457719 | 0.2 | 10 | 4.9 GB/s -Alloc/Free on device: | 0.000156874 | 0.1 | 60 | -GPU: 89% Copy H<->D: 0.41% Alloc/free: 0.082% Comm: 8.2% CPU & I/O: 2.5% +Libraries: | 0.0600684 | 40.8 | 2 | +Kernels: | 
0.0671133 | 45.6 | 1086 | +Copy host to device: | 0.000222822 | 0.2 | 9 | 2.4 GB/s +Copy device to host: | 0.000324692 | 0.2 | 7 | 4.6 GB/s +Alloc/Free on device: | 2.89998e-05 | 0.0 | 60 | +GPU: 86% Copy H<->D: 0.37% Alloc/free: 0.02% Comm: 10% CPU & I/O: 2.9% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0357117 +Time of the post-resolution: 0.0364495 Maximum number of MPI allreduce per time step 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -109,6 +109,6 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 13.374 +Total time for the whole computation 18.8881 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (31 s): 0.524 kW 0.005 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx942x8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx942x8 new file mode 100644 index 0000000000..4de8236f49 --- /dev/null +++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx942x8 @@ -0,0 +1,114 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the PAR_JEL_bous_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 08-06-2026 -- 14:44:57 +OS: a1001__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD Instinct MI300A Accelerator +Total number of threads:192 +GPU model: AMD Instinct MI300A +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 8 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2332800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 11.8595 +Number of virtual exchanges: 91 +Maximum number of MPI allreduce per time step 234 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.833593 +Average number of iteration of the linear solver per call: 19.5 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.14404 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.127115 +Standard deviation between time steps: 
0.00446888 +Time elapsed in the skipped time steps: 0.267194 + +Percent of total time spend in communication: 8.26359 + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0791995 | 62.3 | 2 +Convection operator | 0.002333325 | 1.8 | 4 +Diffusion operator | 0.01340551 | 10.5 | 26 +Gradient operator | 0.002692253 | 2.1 | 4 +Divergence operator | 0.000894222 | 0.7 | 3 +Source terms | 0.001624698 | 1.3 | 2 +Update ::mettre_a_jour | 0.002804274 | 2.2 | 1 +Solver for implicit diffusion | 0.01320826 | 10.4 | 4 +Computation of the time step dt | 0.00250911 | 2.0 | 8 +Turbulence model::update | 0.000777171 | 0.6 | 1 +Post-treatment operations | 0.002293142 | 1.8 | 1 +Other operations | 0.005373586 | 4.2 | +Number of virtual exchanges per time step: 76 +Maximum number of MPI allreduce per time step 66.7 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Average number of iteration of the linear solver per call: 30 + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics: IO +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Output write sequential: 1025 MB/s + +--------------------------------------------------------------------------------------------------------- +Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. 
+--------------------------------------------------------------------------------------------------------- + +Average of the fraction of the time spent in communications between processors: 7.2 % +Max of the fraction of the time spent in communications between processors: 11.6 % +Min of the fraction of the time spent in communications between processors: 5.9 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 1.26385e-05 +Network maximum bandwidth on all processors: 51.3 GB/s +Total network traffic: 1136.61 MB/time step +Average message size: 473.762 kB +Min waiting time: 6.3 % of total time +Max waiting time: 10.5 % of total time +Avg waiting time: 8.1875 % of total time + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0787359 | 61.9 | 2 | +Kernels: | 0.0313283 | 24.6 | 1086 | +Copy host to device: | 0.000201995 | 0.2 | 9 | 2.7 GB/s +Copy device to host: | 0.000282562 | 0.2 | 7 | 5.3 GB/s +Alloc/Free on device: | 7.34502e-05 | 0.1 | 60 | +GPU: 87% Copy H<->D: 0.38% Alloc/free: 0.058% Comm: 10% CPU & I/O: 2.8% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0479479 +Maximum number of MPI allreduce per time step 7 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Max waiting time big => probably due to a bad partitioning 
+Communications > 30% => too many processors or network too slow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Total time for the whole computation 13.3187 + +[Slurm] Power consumption (21 s): 0.735 kW 0.004 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.jean-zay_cc90x8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.jean-zay_cc90x8 index 1cdc57caf3..22117dc475 100644 --- a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.jean-zay_cc90x8 +++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.jean-zay_cc90x8 @@ -8,101 +8,100 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 19-11-2025 -- 12:36:54 -OS: jzxh178__Linux__x86_64__5.14.0-427.76.1.el9_4.x86_64__#1 SMP PREEMPT_DYNAMIC Fri Jun 27 09:53:45 EDT 2025 -CPU: Model name: Intel(R) Xeon(R) Platinum 8468 ; Thread(s) per core: 2 -GPU: | NVIDIA-SMI 570.86.15 Driver Version: 570.86.15 CUDA Version: 12.8 | -| 0 NVIDIA H100 80GB HBM3 On | 00000000: -Nb procs: 8 -TRUST version: 1.9.7_beta +Date: 10-06-2026 -- 10:42:54 +OS: jzxh021__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.20 +Nb procs used for the computation: 8 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of 
the start-up: 7.94766 -Percent of untracked time during computation start-up: 3.64887e-08 -Number of virtual exchanges: 88 +Total time of the start-up: 8.36044 +Number of virtual exchanges: 91 Maximum number of MPI allreduce per time step 234 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.06429 -Average number of iteration of the linear solver per call: 10.5 +Average time of the resolution of the linear problem per call: 1.48918 +Average number of iteration of the linear solver per call: 18 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.28802 +Total time of the time loop: 1.08964 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.143113 -Standard deviation between time steps: 0.00230694 -Time elapsed in the skipped time steps: 0.283043 +Average time per time step: 0.121071 +Standard deviation between time steps: 0.00937755 +Time elapsed in the skipped time steps: 0.389716 -Percent of total time spend in communication: 8.32178 +Percent of total time spend in communication: 3.62109 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0953142 | 66.6 | 2 -Convection operator | 0.003255428 | 2.3 | 4 -Diffusion operator | 0.01383943 | 9.7 | 26 -Gradient operator | 0.001517576 | 1.1 | 4 -Divergence operator | 0.002025552 | 1.4 | 3 -Source terms | 0.00112365 | 0.8 | 2 -Update ::mettre_a_jour | 0.002446269 | 1.7 | 1 -Solver for implicit diffusion | 0.01201407 | 8.4 | 4 -Computation of the time step dt | 
0.001550891 | 1.1 | 8 -Turbulence model::update | 0.0007519211 | 0.5 | 1 -Post-treatment operations | 0.003537479 | 2.5 | 1 -Other operations | 0.005736537 | 4.0 | - -Untracked time | 7.27e-05 | 0.00564 | - -Total number of virtual exchanges: 848 -Maximum number of MPI allreduce per time step 76.6 +Linear solver resolutions Ax=B | 0.0842484 | 69.6 | 2 +Convection operator | 0.00200081 | 1.7 | 4 +Diffusion operator | 0.01016694 | 8.4 | 26 +Gradient operator | 0.001346033 | 1.1 | 4 +Divergence operator | 0.00067062 | 0.6 | 3 +Source terms | 0.001082813 | 0.9 | 2 +Update ::mettre_a_jour | 0.001681446 | 1.4 | 1 +Solver for implicit diffusion | 0.00801906 | 6.6 | 4 +Computation of the time step dt | 0.001480507 | 1.2 | 8 +Turbulence model::update | 0.0004644671 | 0.4 | 1 +Post-treatment operations | 0.005549941 | 4.6 | 1 +Other operations | 0.004360034 | 3.6 | +Number of virtual exchanges per time step: 76 +Maximum number of MPI allreduce per time step 66.7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Average number of iteration of the linear solver per call: 18 +Average number of iteration of the linear solver per call: 28.3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics: IO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Output write sequential: 2095 MB/s +Output write sequential: 2180 MB/s --------------------------------------------------------------------------------------------------------- Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. 
--------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 6.6 % -Max of the fraction of the time spent in communications between processors: 10 % +Average of the fraction of the time spent in communications between processors: 5.2 % +Max of the fraction of the time spent in communications between processors: 8.2 % Min of the fraction of the time spent in communications between processors: 4.7 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 4.72424e-06 -Network maximum bandwidth on all processors: 116.6 GB/s -Total network traffic: 1288 MB/time step -Average message size: 461.906 kB -Min waiting time: 5 % of total time -Max waiting time: 8.3 % of total time -Avg waiting time: 6.275 % of total time +Time of one mpsum measured by an internal bench over 0.1s (network latency): 6.09336e-06 +Network maximum bandwidth on all processors: 131.6 GB/s +Total network traffic: 1136.61 MB/time step +Average message size: 473.762 kB +Min waiting time: 4.5 % of total time +Max waiting time: 7.8 % of total time +Avg waiting time: 6.6875 % of total time ----------------------------------------------------------------------------------------------------------- GPU statistics ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0950047 | 66.4 | 2 | -Kernels: | 0.0258533 | 18.1 | 1090 | -Copy host to device: | 0.00027859 | 0.2 | 12 | 4.6 GB/s -Copy device to host: | 0.000448907 | 0.3 | 10 | 5.0 GB/s -Alloc/Free on device: | 0.00298098 | 2.1 | 60 | -GPU: 84% Copy H<->D: 0.51% Alloc/free: 2.1% Comm: 8.3% CPU & I/O: 4.6% +Libraries: | 0.0839351 | 69.3 | 2 | +Kernels: | 0.0227148 
| 18.8 | 1086 | +Copy host to device: | 0.000178755 | 0.1 | 9 | 3.0 GB/s +Copy device to host: | 0.000334874 | 0.3 | 7 | 4.5 GB/s +Alloc/Free on device: | 0.000144695 | 0.1 | 60 | +GPU: 88% Copy H<->D: 0.42% Alloc/free: 0.12% Comm: 4.9% CPU & I/O: 6.5% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0187218 +Time of the post-resolution: 0.0332711 Maximum number of MPI allreduce per time step 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -110,5 +109,6 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 9.53745 +Total time for the whole computation 9.87309 +[Slurm] Power consumption (25 s): 0.894 kW 0.006 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.lumi_gfx90ax8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.lumi_gfx90ax8 index 64b93ba748..013e038296 100644 --- a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.lumi_gfx90ax8 +++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.lumi_gfx90ax8 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 24-02-2026 -- 00:10:38 -OS: nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 15-05-2026 -- 20:37:03 +OS: 
nid005023__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,42 +22,42 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 34.1602 -Number of virtual exchanges: 88 -Maximum number of MPI allreduce per time step 230 +Total time of the start-up: 55.7779 +Number of virtual exchanges: 91 +Maximum number of MPI allreduce per time step 234 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 2.11372 +Average time of the resolution of the linear problem per call: 2.46717 Average number of iteration of the linear solver per call: 19.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.73935 +Total time of the time loop: 1.58648 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.193261 -Standard deviation between time steps: 0.00394785 -Time elapsed in the skipped time steps: 0.403044 +Average time per time step: 0.176276 +Standard deviation between time steps: 0.00386474 +Time elapsed in the skipped time steps: 0.382443 -Percent of total time spend in communication: 7.14789 +Percent of total time spend in communication: 7.163 Standard counter description | Time/step | % loop time | Call(s)/step 
------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0995881 | 41.8 | 2 -Convection operator | 0.008435531 | 3.5 | 4 -Diffusion operator | 0.02842003 | 11.9 | 26 -Gradient operator | 0.009849339 | 4.1 | 4 -Divergence operator | 0.002951774 | 1.2 | 3 -Source terms | 0.009550072 | 4.0 | 2 -Update ::mettre_a_jour | 0.003524391 | 1.5 | 1 -Solver for implicit diffusion | 0.01629956 | 6.8 | 4 -Computation of the time step dt | 0.005308412 | 2.2 | 8 -Turbulence model::update | 0.001210523 | 0.5 | 1 -Post-treatment operations | 0.002457478 | 1.0 | 1 -Other operations | 0.005665654 | 2.4 | -Number of virtual exchanges per time step: 80 +Linear solver resolutions Ax=B | 0.0943011 | 53.5 | 2 +Convection operator | 0.00531937 | 3.0 | 4 +Diffusion operator | 0.02432901 | 13.8 | 26 +Gradient operator | 0.005692192 | 3.2 | 4 +Divergence operator | 0.003190957 | 1.8 | 3 +Source terms | 0.009910483 | 5.6 | 2 +Update ::mettre_a_jour | 0.003489126 | 2.0 | 1 +Solver for implicit diffusion | 0.01607808 | 9.1 | 4 +Computation of the time step dt | 0.004845297 | 2.7 | 8 +Turbulence model::update | 0.001111563 | 0.6 | 1 +Post-treatment operations | 0.00256484 | 1.5 | 1 +Other operations | 0.005443877 | 3.1 | +Number of virtual exchanges per time step: 76 Maximum number of MPI allreduce per time step 66.7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -70,38 +70,38 @@ Average number of iteration of the linear solver per call: 30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics: IO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Output write sequential: 963 MB/s +Output write sequential: 951 MB/s 
--------------------------------------------------------------------------------------------------------- Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. --------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 4.7 % +Average of the fraction of the time spent in communications between processors: 5 % Max of the fraction of the time spent in communications between processors: 9 % -Min of the fraction of the time spent in communications between processors: 4.4 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 3.59628e-06 -Network maximum bandwidth on all processors: 41.8 GB/s +Min of the fraction of the time spent in communications between processors: 4.9 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 3.71493e-06 +Network maximum bandwidth on all processors: 42.4 GB/s Total network traffic: 1136.61 MB/time step Average message size: 473.762 kB -Min waiting time: 4.3 % of total time -Max waiting time: 8.6 % of total time -Avg waiting time: 5.675 % of total time +Min waiting time: 4.8 % of total time +Max waiting time: 8.7 % of total time +Avg waiting time: 6.05 % of total time ----------------------------------------------------------------------------------------------------------- GPU statistics ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0991658 | 51.3 | 2 | -Kernels: | 0.0714553 | 37.0 | 1084 | -Copy host to device: | 0.000330243 | 0.2 | 12 | 3.9 GB/s -Copy device to host: | 0.00048114 | 0.2 | 10 | 4.7 GB/s -Alloc/Free on device: | 0.000176988 | 0.1 | 60 | 
-GPU: 88% Copy H<->D: 0.42% Alloc/free: 0.092% Comm: 8.8% CPU & I/O: 2.4% +Libraries: | 0.0938749 | 53.3 | 2 | +Kernels: | 0.0608573 | 34.5 | 1086 | +Copy host to device: | 0.000337062 | 0.2 | 12 | 3.8 GB/s +Copy device to host: | 0.000501417 | 0.3 | 10 | 4.5 GB/s +Alloc/Free on device: | 3.41883e-05 | 0.0 | 60 | +GPU: 88% Copy H<->D: 0.48% Alloc/free: 0.019% Comm: 8.9% CPU & I/O: 2.8% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0405752 +Time of the post-resolution: 0.0376324 Maximum number of MPI allreduce per time step 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -109,6 +109,6 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 36.3432 +Total time for the whole computation 57.7844 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (88 s): 0.510 kW 0.012 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.topaze_cc80x8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.topaze_cc80x8 index 56bb61f074..16f94723d3 100644 --- a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.topaze_cc80x8 +++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.topaze_cc80x8 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 17:52:49 -OS: 
topaze7033__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 15-05-2026 -- 13:40:59 +OS: topaze7018__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,86 +22,86 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 10.7416 -Number of virtual exchanges: 88 -Maximum number of MPI allreduce per time step 230 +Total time of the start-up: 10.6546 +Number of virtual exchanges: 91 +Maximum number of MPI allreduce per time step 234 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.74355 +Average time of the resolution of the linear problem per call: 1.49978 Average number of iteration of the linear solver per call: 18 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.62302 +Total time of the time loop: 1.57279 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.180335 -Standard deviation between time steps: 0.00980316 -Time elapsed in the skipped time steps: 0.429487 +Average time per time step: 0.174755 +Standard deviation between time steps: 0.0105474 +Time elapsed in the skipped time steps: 0.406459 -Percent of total time spend in communication: 8.93758 +Percent of total time 
spend in communication: 7.75645 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.118436 | 51.9 | 2 -Convection operator | 0.00377104 | 1.7 | 4 -Diffusion operator | 0.01757169 | 7.7 | 26 -Gradient operator | 0.002272595 | 1.0 | 4 -Divergence operator | 0.002091061 | 0.9 | 3 -Source terms | 0.001597418 | 0.7 | 2 -Update ::mettre_a_jour | 0.003249867 | 1.4 | 1 -Solver for implicit diffusion | 0.01671793 | 7.3 | 4 -Computation of the time step dt | 0.002845562 | 1.2 | 8 -Turbulence model::update | 0.0009740548 | 0.4 | 1 -Post-treatment operations | 0.004604644 | 2.0 | 1 -Other operations | 0.006203393 | 2.7 | -Number of virtual exchanges per time step: 80 +Linear solver resolutions Ax=B | 0.122977 | 70.4 | 2 +Convection operator | 0.002903186 | 1.7 | 4 +Diffusion operator | 0.0138009 | 7.9 | 26 +Gradient operator | 0.001907345 | 1.1 | 4 +Divergence operator | 0.001499655 | 0.9 | 3 +Source terms | 0.001528563 | 0.9 | 2 +Update ::mettre_a_jour | 0.002610787 | 1.5 | 1 +Solver for implicit diffusion | 0.01478346 | 8.5 | 4 +Computation of the time step dt | 0.002501599 | 1.4 | 8 +Turbulence model::update | 0.0006880512 | 0.4 | 1 +Post-treatment operations | 0.004329323 | 2.5 | 1 +Other operations | 0.005224245 | 3.0 | +Number of virtual exchanges per time step: 76 Maximum number of MPI allreduce per time step 66.7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Warning: number of MPI allreduce calls per time step is high. 
Contact TRUST team to run massive parallel calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Average number of iteration of the linear solver per call: 29.7 +Average number of iteration of the linear solver per call: 28.3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics: IO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Output write sequential: 1082 MB/s +Output write sequential: 1103 MB/s --------------------------------------------------------------------------------------------------------- Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. --------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 6.8 % -Max of the fraction of the time spent in communications between processors: 11.4 % -Min of the fraction of the time spent in communications between processors: 6.2 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 1.70475e-05 +Average of the fraction of the time spent in communications between processors: 6.5 % +Max of the fraction of the time spent in communications between processors: 11 % +Min of the fraction of the time spent in communications between processors: 6.1 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 1.71348e-05 Network maximum bandwidth on all processors: 46.4 GB/s Total network traffic: 1136.61 MB/time step Average message size: 473.762 kB -Min waiting time: 6.9 % of total time -Max waiting time: 10.6 % of total time -Avg waiting time: 8.05 % of total time +Min waiting time: 6.7 % of total time +Max waiting time: 9 % of total time +Avg waiting time: 7.475 % 
of total time ----------------------------------------------------------------------------------------------------------- GPU statistics ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.118065 | 65.5 | 2 | -Kernels: | 0.0327413 | 18.2 | 1084 | -Copy host to device: | 0.000336012 | 0.2 | 12 | 3.9 GB/s -Copy device to host: | 0.000480723 | 0.3 | 10 | 4.7 GB/s -Alloc/Free on device: | 0.0033014 | 1.8 | 60 | -GPU: 84% Copy H<->D: 0.45% Alloc/free: 1.8% Comm: 11% CPU & I/O: 2.8% +Libraries: | 0.122609 | 70.2 | 2 | +Kernels: | 0.0292873 | 16.8 | 1086 | +Copy host to device: | 0.00032894 | 0.2 | 12 | 3.9 GB/s +Copy device to host: | 0.000453863 | 0.3 | 10 | 5.0 GB/s +Alloc/Free on device: | 0.000211454 | 0.1 | 60 | +GPU: 87% Copy H<->D: 0.45% Alloc/free: 0.12% Comm: 9.8% CPU & I/O: 2.8% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0423874 +Time of the post-resolution: 0.0395549 Maximum number of MPI allreduce per time step 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -109,5 +109,6 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 12.8365 +Total time for the whole computation 12.6734 +[Slurm] Power consumption (59 s): 5.418 kW 0.089 kWh 0.009 € (0.10€/kWh) diff 
--git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx90a b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx90a index 47b2a9e24a..023bcc5668 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx90a +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 21:15:33 -OS: g1023__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 09:27:03 +OS: g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 51840 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 5.93888 +Total time of the start-up: 6.12593 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.185608 -Average number of iteration of the linear solver per call: 48 +Average time of the resolution of the linear problem per call: 0.193628 +Average number of iteration of the linear solver per call: 26 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.131921 +Total time of the time loop: 0.0968883 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0146579 -Standard deviation between time steps: 0.00787638 -Time elapsed in the skipped time steps: 0.0157475 +Average time per time step: 0.0107654 +Standard deviation between time steps: 0.00741937 +Time elapsed in the skipped time steps: 0.0109251 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.00852404 | 51.9 | 1 -Convection operator | 0.0004709512 | 2.9 | 1 -Diffusion operator | 0.0002746094 | 1.7 | 1 -Gradient operator | 0.0003199339 | 1.9 | 2 -Divergence operator | 0.0001807804 | 1.1 | 2 -Update ::mettre_a_jour | 0.0003165519 | 1.9 | 1 -Computation of the time step dt | 0.0002572336 | 1.6 | 2 -Post-treatment operations | 0.00343332 | 20.9 | 1 -Other operations | 0.0008805088 | 5.4 | +Linear solver resolutions Ax=B | 0.00462653 | 43.0 | 1 +Convection operator | 0.000513395 | 4.8 | 1 +Diffusion operator | 0.0002880368 | 2.7 | 1 +Gradient operator | 0.0002975948 | 2.8 | 2 +Divergence operator | 0.0001837762 | 1.7 | 2 +Update ::mettre_a_jour | 0.0003529006 | 3.3 | 1 +Computation of the time step dt | 0.0002610717 | 2.4 | 2 +Post-treatment operations | 0.003305068 | 30.7 | 1 +Other operations | 0.0009369908 | 8.7 | -Average number of iteration of the linear solver per call: 42 +Average number of iteration of the linear solver per call: 22.2 ----------------------------------------------------------------------------------------------------------- @@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call: 42 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop 
time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.00845284 | 57.7 | 1 | -Kernels: | 0.00235428 | 16.1 | 98 | -Copy host to device: | 0.000145101 | 1.0 | 8 | 1.5 GB/s -Copy device to host: | 0.000132154 | 0.9 | 4 | 4.1 GB/s +Libraries: | 0.00455693 | 42.3 | 1 | +Kernels: | 0.00238087 | 22.1 | 97 | +Copy host to device: | 0.000149197 | 1.4 | 8 | 1.4 GB/s +Copy device to host: | 0.000136187 | 1.3 | 4 | 4.0 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 74% Copy H<->D: 1.9% Alloc/free: 0% Comm: 0% CPU & I/O: 24% +GPU: 64% Copy H<->D: 2.7% Alloc/free: 0% Comm: 0% CPU & I/O: 33% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.00639265 +Time of the post-resolution: 0.00551343 -Total time for the whole computation 6.09295 +Total time for the whole computation 6.23927 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (14 s): 0.268 kW 0.001 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx942 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx942 index b06f2a3e97..99aa203f47 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx942 +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx942 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 17-03-2026 -- 17:58:19 -OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 
2025 +Date: 23-04-2026 -- 14:47:58 +OS: a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD Instinct MI300A Accelerator Total number of threads:192 GPU model: AMD Instinct MI300A @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 51840 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 7.01774 +Total time of the start-up: 7.06812 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.206942 -Average number of iteration of the linear solver per call: 48 +Average time of the resolution of the linear problem per call: 0.203738 +Average number of iteration of the linear solver per call: 26 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.0987891 +Total time of the time loop: 0.0858901 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0109766 -Standard deviation between time steps: 0.00617032 -Time elapsed in the skipped time steps: 0.0148606 +Average time per time step: 0.00954334 +Standard deviation between time steps: 0.0058059 +Time elapsed in the skipped time steps: 0.0155645 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.00647832 | 59.0 | 1 -Convection operator | 0.0002395161 
| 2.2 | 1 -Diffusion operator | 0.0001366721 | 1.2 | 1 -Gradient operator | 0.0001858223 | 1.7 | 2 -Divergence operator | 0.0001313133 | 1.2 | 2 -Update ::mettre_a_jour | 0.0002394639 | 2.2 | 1 -Computation of the time step dt | 0.00016156 | 1.5 | 2 -Post-treatment operations | 0.002719926 | 24.8 | 1 -Other operations | 0.0006839746 | 6.2 | +Linear solver resolutions Ax=B | 0.00467528 | 49.0 | 1 +Convection operator | 0.0002698356 | 2.8 | 1 +Diffusion operator | 0.0001643067 | 1.7 | 1 +Gradient operator | 0.0002388463 | 2.5 | 2 +Divergence operator | 0.0001593037 | 1.7 | 2 +Update ::mettre_a_jour | 0.000304429 | 3.2 | 1 +Computation of the time step dt | 0.0001922116 | 2.0 | 2 +Post-treatment operations | 0.002630262 | 27.6 | 1 +Other operations | 0.0009088682 | 9.5 | -Average number of iteration of the linear solver per call: 42 +Average number of iteration of the linear solver per call: 22.2 ----------------------------------------------------------------------------------------------------------- @@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call: 42 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0064201 | 58.5 | 1 | -Kernels: | 0.00149049 | 13.6 | 98 | -Copy host to device: | 0.000110276 | 1.0 | 8 | 1.9 GB/s -Copy device to host: | 0.000110103 | 1.0 | 4 | 5.0 GB/s +Libraries: | 0.00460235 | 48.2 | 1 | +Kernels: | 0.00185681 | 19.5 | 97 | +Copy host to device: | 0.000133123 | 1.4 | 8 | 1.6 GB/s +Copy device to host: | 0.000127818 | 1.3 | 4 | 4.3 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 72% Copy H<->D: 2% Alloc/free: 0% Comm: 0% CPU & I/O: 26% +GPU: 68% Copy H<->D: 2.7% Alloc/free: 0% Comm: 0% CPU & I/O: 30% 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.00526473 +Time of the post-resolution: 0.0048061 -Total time for the whole computation 7.13666 +Total time for the whole computation 7.17439 -[Slurm] Power consumption (15 s): 0.446 kW 0.002 kWh 0.000 € (0.10€/kWh) +[Slurm] Power consumption (16 s): 0.396 kW 0.002 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.dalianvl_cc100 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.dalianvl_cc100 new file mode 100644 index 0000000000..28cbcce359 --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.dalianvl_cc100 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:17:34 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 51840 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 
Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 1.2994 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.0740778 +Average number of iteration of the linear solver per call: 26 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.106793 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0118658 +Standard deviation between time steps: 0.00600289 +Time elapsed in the skipped time steps: 0.00831561 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0065994 | 55.6 | 1 +Convection operator | 0.0002036051 | 1.7 | 1 +Diffusion operator | 0.0001235908 | 1.0 | 1 +Gradient operator | 0.0001959857 | 1.7 | 2 +Divergence operator | 0.0001151963 | 1.0 | 2 +Update ::mettre_a_jour | 0.0002897668 | 2.4 | 1 +Computation of the time step dt | 0.0001778879 | 1.5 | 2 +Post-treatment operations | 0.002707086 | 22.8 | 1 +Other operations | 0.001453324 | 12.2 | + +Average number of iteration of the linear solver per call: 22.2 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00650064 | 54.8 | 1 | +Kernels: | 0.00148095 | 12.5 | 97 | +Copy host to device: | 0.000140437 | 1.2 | 8 | 1.5 GB/s +Copy device to host: | 8.81669e-05 | 0.7 | 4 | 6.2 GB/s +Alloc/Free on device: | 0 | 0.0 | 0 | +GPU: 67% Copy H<->D: 1.9% Alloc/free: 0% Comm: 0% CPU & I/O: 31% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.00378595 + +Total time for the whole computation 1.4183 + diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.eureka_cc89 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.eureka_cc89 new file mode 100644 index 0000000000..7093500372 --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.eureka_cc89 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:34:51 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 51840 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 1.19061 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.124688 +Average number of iteration of the linear solver per call: 26 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.112536 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.012504 +Standard deviation between time steps: 0.00837201 +Time elapsed in the skipped time steps: 0.00515174 + + 
+Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00692501 | 55.4 | 1 +Convection operator | 0.0002694614 | 2.2 | 1 +Diffusion operator | 0.0001328644 | 1.1 | 1 +Gradient operator | 0.0001807441 | 1.4 | 2 +Divergence operator | 0.0001022032 | 0.8 | 2 +Update ::mettre_a_jour | 0.0002516219 | 2.0 | 1 +Computation of the time step dt | 0.0001513772 | 1.2 | 2 +Post-treatment operations | 0.003694517 | 29.5 | 1 +Other operations | 0.0007962459 | 6.4 | + +Average number of iteration of the linear solver per call: 22.2 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00685329 | 54.8 | 1 | +Kernels: | 0.00129797 | 10.4 | 97 | +Copy host to device: | 0.000107672 | 0.9 | 8 | 2.0 GB/s +Copy device to host: | 0.000139247 | 1.1 | 4 | 3.9 GB/s +Alloc/Free on device: | 0 | 0.0 | 0 | +GPU: 65% Copy H<->D: 2% Alloc/free: 0% Comm: 0% CPU & I/O: 33% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.00246909 + +Total time for the whole computation 1.31078 + diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.irene-amd-ccrt_cc70 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.irene-amd-ccrt_cc70 index 69c8125fd3..2f0b2a6301 100644 --- 
a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.irene-amd-ccrt_cc70 +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.irene-amd-ccrt_cc70 @@ -8,52 +8,51 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 14-11-2025 -- 10:32:59 -OS: irene7056__Linux__x86_64__4.18.0-553.69.1.el8_10.x86_64__#1 SMP Thu Aug 7 18:10:00 EDT 2025 -CPU: Model name: Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz ; Thread(s) per core: 2 -GPU: | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | - -Nb procs: 1 -TRUST version: 1.9.7_beta +Date: 23-04-2026 -- 14:56:21 +OS: irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz +Total number of threads:80 +GPU model: Tesla V100-SXM2-16GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 51840 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 3.65722 -Percent of untracked time during computation start-up: 1.05162e-06 +Total time of the start-up: 1.07934 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.501134 -Average number of iteration of the linear solver per call: 48 +Average time of the resolution of the linear problem per call: 0.116356 +Average number of iteration of the linear solver per call: 26 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.233292 +Total time of the time loop: 0.152678 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0259213 -Standard deviation between time steps: 0.0101201 -Time elapsed in the skipped time steps: 0.0285981 +Average time per time step: 0.0169642 +Standard deviation between time steps: 0.0113575 +Time elapsed in the skipped time steps: 0.0292522 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0187438 | 72.3 | 1 -Convection operator | 0.0005950887 | 2.3 | 1 -Diffusion operator | 0.0002307317 | 0.9 | 1 -Gradient operator | 0.000239906 | 0.9 | 2 -Divergence operator | 0.0001558698 | 0.6 | 2 -Update ::mettre_a_jour | 0.0003146512 | 1.2 | 1 -Computation of the time step dt | 0.0002480136 | 1.0 | 2 -Post-treatment operations | 0.004528829 | 17.5 | 1 -Other operations | 0.0008644418 | 3.3 | - -Untracked time | 4.99e-05 | 0.0214 | - +Linear solver resolutions Ax=B | 0.00966756 | 57.0 | 1 +Convection operator | 0.0003741867 | 2.2 | 1 +Diffusion operator | 0.0001976752 | 1.2 | 1 +Gradient operator | 0.0002378338 | 1.4 | 2 +Divergence operator | 0.0001592834 | 0.9 | 2 +Update ::mettre_a_jour | 0.0003339658 | 2.0 | 1 +Computation of the time step dt | 0.0002590474 | 1.5 | 2 +Post-treatment operations | 0.004811701 | 28.4 | 1 +Other operations | 0.0009229322 | 5.4 | -Average number of iteration of the linear solver per call: 41.8 +Average number of iteration of the linear solver per call: 22.2 
----------------------------------------------------------------------------------------------------------- @@ -61,16 +60,17 @@ Average number of iteration of the linear solver per call: 41.8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0186583 | 72.0 | 1 | -Kernels: | 0.00218357 | 8.4 | 98 | -Copy host to device: | 0.000161664 | 0.6 | 8 | 1.3 GB/s -Copy device to host: | 0.000185338 | 0.7 | 4 | 2.9 GB/s +Libraries: | 0.00957908 | 56.5 | 1 | +Kernels: | 0.00190241 | 11.2 | 97 | +Copy host to device: | 0.000173999 | 1.0 | 8 | 1.2 GB/s +Copy device to host: | 0.000192334 | 1.1 | 4 | 2.8 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 80% Copy H<->D: 1.3% Alloc/free: 0% Comm: 0% CPU & I/O: 18% +GPU: 68% Copy H<->D: 2.2% Alloc/free: 0% Comm: 0% CPU & I/O: 30% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0107429 +Time of the post-resolution: 0.00647745 -Total time for the whole computation 3.92985 +Total time for the whole computation 1.26776 +[Slurm] Power consumption (13 s): 0.243 kW 0.001 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is157091_cc86 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is157091_cc86 index 57da033e4c..4ea430e541 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is157091_cc86 +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is157091_cc86 @@ -8,7 +8,7 @@ Time is given in seconds 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 11-03-2026 -- 20:22:16 +Date: 22-04-2026 -- 20:45:47 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 51840 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 0.436959 +Total time of the start-up: 0.491229 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.0725759 -Average number of iteration of the linear solver per call: 48 +Average time of the resolution of the linear problem per call: 0.0660598 +Average number of iteration of the linear solver per call: 26 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.173573 +Total time of the time loop: 0.106553 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0192858 -Standard deviation between time steps: 0.00569502 -Time elapsed in the skipped time steps: 0.00380361 +Average time per time step: 0.0118392 +Standard deviation between time steps: 0.00583933 +Time elapsed in the skipped time steps: 0.00365864 
Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0149777 | 77.7 | 1 -Convection operator | 0.0003584584 | 1.9 | 1 -Diffusion operator | 0.0001395967 | 0.7 | 1 -Gradient operator | 0.0001576809 | 0.8 | 2 -Divergence operator | 0.0001185614 | 0.6 | 2 -Update ::mettre_a_jour | 0.0002077442 | 1.1 | 1 -Computation of the time step dt | 0.0001929384 | 1.0 | 2 -Post-treatment operations | 0.002563678 | 13.3 | 1 -Other operations | 0.0005694627 | 3.0 | +Linear solver resolutions Ax=B | 0.00756416 | 63.9 | 1 +Convection operator | 0.0003625891 | 3.1 | 1 +Diffusion operator | 0.0001376103 | 1.2 | 1 +Gradient operator | 0.0001551979 | 1.3 | 2 +Divergence operator | 0.0001102787 | 0.9 | 2 +Update ::mettre_a_jour | 0.0002033066 | 1.7 | 1 +Computation of the time step dt | 0.0001916568 | 1.6 | 2 +Post-treatment operations | 0.002546436 | 21.5 | 1 +Other operations | 0.0005679914 | 4.8 | -Average number of iteration of the linear solver per call: 41.8 +Average number of iteration of the linear solver per call: 22.2 ----------------------------------------------------------------------------------------------------------- @@ -60,16 +60,16 @@ Average number of iteration of the linear solver per call: 41.8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0149218 | 77.4 | 1 | -Kernels: | 0.00149672 | 7.8 | 98 | -Copy host to device: | 8.56409e-05 | 0.4 | 8 | 2.5 GB/s -Copy device to host: | 0.000105018 | 0.5 | 4 | 5.2 GB/s +Libraries: | 0.00750619 | 63.4 | 1 | +Kernels: | 0.00146924 | 12.4 | 97 | +Copy host to device: | 8.5329e-05 | 0.7 | 8 | 2.5 GB/s +Copy device to host: | 
0.000105709 | 0.9 | 4 | 5.2 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 85% Copy H<->D: 0.99% Alloc/free: 0% Comm: 0% CPU & I/O: 14% +GPU: 76% Copy H<->D: 1.6% Alloc/free: 0% Comm: 0% CPU & I/O: 23% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.00183913 +Time of the post-resolution: 0.00270217 -Total time for the whole computation 0.616183 +Total time for the whole computation 0.604151 diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is159479_cc120 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is159479_cc120 new file mode 100644 index 0000000000..19f788812e --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is159479_cc120 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-05-2026 -- 07:05:25 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May 1 12:45:19 UTC 2026 (6 +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 51840 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 0.363946 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.0499642 +Average number of iteration of the linear solver per call: 26 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.0787909 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.00875454 +Standard deviation between time steps: 0.00358588 +Time elapsed in 
the skipped time steps: 0.00282589 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00601979 | 68.8 | 1 +Convection operator | 0.0001674919 | 1.9 | 1 +Diffusion operator | 8.354833e-05 | 1.0 | 1 +Gradient operator | 0.0001087588 | 1.2 | 2 +Divergence operator | 6.7767e-05 | 0.8 | 2 +Update ::mettre_a_jour | 0.0001461213 | 1.7 | 1 +Computation of the time step dt | 9.826378e-05 | 1.1 | 2 +Post-treatment operations | 0.001635225 | 18.7 | 1 +Other operations | 0.0004275759 | 4.9 | + +Average number of iteration of the linear solver per call: 22.2 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00598622 | 68.4 | 1 | +Kernels: | 0.000918968 | 10.5 | 97 | +Copy host to device: | 6.19321e-05 | 0.7 | 8 | 3.5 GB/s +Copy device to host: | 0.000119888 | 1.4 | 4 | 4.5 GB/s +Alloc/Free on device: | 0 | 0.0 | 0 | +GPU: 79% Copy H<->D: 2.1% Alloc/free: 0% Comm: 0% CPU & I/O: 19% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.00140214 + +Total time for the whole computation 0.446975 + diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is247793_gfx1100 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is247793_gfx1100 new file mode 100644 index 0000000000..5b87937b2e --- /dev/null +++ 
b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is247793_gfx1100 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 19:04:54 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 51840 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 1.23986 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.413036 +Average number of iteration of the linear solver per call: 26 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 
0.0998113 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0110901 +Standard deviation between time steps: 0.00412294 +Time elapsed in the skipped time steps: 0.00524283 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.00646524 | 58.3 | 1 +Convection operator | 0.0003997576 | 3.6 | 1 +Diffusion operator | 0.0002390461 | 2.2 | 1 +Gradient operator | 0.0002989122 | 2.7 | 2 +Divergence operator | 0.0001892956 | 1.7 | 2 +Update ::mettre_a_jour | 0.0003517464 | 3.2 | 1 +Computation of the time step dt | 0.0002773126 | 2.5 | 2 +Post-treatment operations | 0.001906918 | 17.2 | 1 +Other operations | 0.0009619149 | 8.7 | + +Average number of iteration of the linear solver per call: 22.2 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.00638564 | 57.6 | 1 | +Kernels: | 0.00255147 | 23.0 | 97 | +Copy host to device: | 0.000176835 | 1.6 | 8 | 1.2 GB/s +Copy device to host: | 9.96538e-05 | 0.9 | 4 | 5.5 GB/s +Alloc/Free on device: | 0 | 0.0 | 0 | +GPU: 81% Copy H<->D: 2.5% Alloc/free: 0% Comm: 0% CPU & I/O: 17% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.00283027 + +Total time for the whole computation 1.34775 + diff --git 
a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.jean-zay_cc90 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.jean-zay_cc90 index f8fb79c9e3..4b7e6ff550 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.jean-zay_cc90 +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.jean-zay_cc90 @@ -8,52 +8,51 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 19-11-2025 -- 11:42:09 -OS: jzxh177__Linux__x86_64__5.14.0-427.76.1.el9_4.x86_64__#1 SMP PREEMPT_DYNAMIC Fri Jun 27 09:53:45 EDT 2025 -CPU: Model name: Intel(R) Xeon(R) Platinum 8468 ; Thread(s) per core: 2 -GPU: | NVIDIA-SMI 570.86.15 Driver Version: 570.86.15 CUDA Version: 12.8 | -| 0 NVIDIA H100 80GB HBM3 On | 00000000: -Nb procs: 1 -TRUST version: 1.9.7_beta +Date: 23-04-2026 -- 08:20:22 +OS: jzxh011__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 51840 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 3.28177 -Percent of untracked time during computation start-up: 9.65941e-08 +Total time of the start-up: 2.50003 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.70507 -Average number of iteration of the linear solver per 
call: 48 +Average time of the resolution of the linear problem per call: 0.0936644 +Average number of iteration of the linear solver per call: 26 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.183362 +Total time of the time loop: 0.112767 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0203736 -Standard deviation between time steps: 0.00609699 -Time elapsed in the skipped time steps: 0.0231738 +Average time per time step: 0.0125296 +Standard deviation between time steps: 0.00642425 +Time elapsed in the skipped time steps: 0.112153 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0146579 | 71.9 | 1 -Convection operator | 0.0002678476 | 1.3 | 1 -Diffusion operator | 0.0001548862 | 0.8 | 1 -Gradient operator | 0.0001796749 | 0.9 | 2 -Divergence operator | 0.0001112369 | 0.5 | 2 -Update ::mettre_a_jour | 0.0002635913 | 1.3 | 1 -Computation of the time step dt | 0.0001501152 | 0.7 | 2 -Post-treatment operations | 0.002908446 | 14.3 | 1 -Other operations | 0.001679913 | 8.2 | +Linear solver resolutions Ax=B | 0.00675927 | 53.9 | 1 +Convection operator | 0.000223977 | 1.8 | 1 +Diffusion operator | 0.0001341254 | 1.1 | 1 +Gradient operator | 0.0001789626 | 1.4 | 2 +Divergence operator | 0.0001138861 | 0.9 | 2 +Update ::mettre_a_jour | 0.0002445856 | 2.0 | 1 +Computation of the time step dt | 0.0001574472 | 1.3 | 2 +Post-treatment operations | 0.00296707 | 23.7 | 1 +Other operations | 0.001750308 | 14.0 | -Untracked time | 5.81e-05 | 0.0317 | - - -Average number of iteration of 
the linear solver per call: 41.8 +Average number of iteration of the linear solver per call: 22.2 ----------------------------------------------------------------------------------------------------------- @@ -61,16 +60,17 @@ Average number of iteration of the linear solver per call: 41.8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0145851 | 71.6 | 1 | -Kernels: | 0.0014527 | 7.1 | 98 | -Copy host to device: | 0.000107502 | 0.5 | 8 | 2.0 GB/s -Copy device to host: | 0.000125785 | 0.6 | 4 | 4.3 GB/s +Libraries: | 0.00669146 | 53.4 | 1 | +Kernels: | 0.00135639 | 10.8 | 97 | +Copy host to device: | 0.000121315 | 1.0 | 8 | 1.8 GB/s +Copy device to host: | 0.000134266 | 1.1 | 4 | 4.1 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 79% Copy H<->D: 1.1% Alloc/free: 0% Comm: 0% CPU & I/O: 20% +GPU: 64% Copy H<->D: 2% Alloc/free: 0% Comm: 0% CPU & I/O: 34% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0166082 +Time of the post-resolution: 0.0203563 -Total time for the whole computation 3.50492 +Total time for the whole computation 2.74533 +[Slurm] Power consumption (12 s): 0.393 kW 0.001 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.lumi_gfx90a b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.lumi_gfx90a index d575a024b3..820e885b9b 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.lumi_gfx90a +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.lumi_gfx90a @@ -1,76 +1 @@ - # Global performance file # - -This is the 
global file for tracking performance in TRUST. It stores aggregated quantities. -More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file -For time loop, only standard counters of level 1 are printed alongside your custom counters -Time is given in seconds - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Context of the computation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 24-02-2026 -- 00:12:53 -OS: nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) -CPU model : AMD EPYC 7A53 64-Core Processor -Total number of threads:128 -GPU model: AMD Instinct MI250X -HIP runtime version: 6.43 -HIP drivers version: 6.43 -Nb procs used for the computation: 1 -TRUST version: 1.9.8_beta -Total number of elements used for the calculation: 51840 - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Computation start-up statistics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 39.2215 - -Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.44943 -Average number of iteration of the linear solver per call: 48 - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Time loop statistics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.12493 -Number of time steps: 9 -Skipped time steps: 1 -Average time per time step: 0.0138811 -Standard deviation between time steps: 
0.0069912 -Time elapsed in the skipped time steps: 0.0144053 - - -Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------- -Linear solver resolutions Ax=B | 0.00808639 | 52.2 | 1 -Convection operator | 0.0004544939 | 2.9 | 1 -Diffusion operator | 0.00026234 | 1.7 | 1 -Gradient operator | 0.0003127612 | 2.0 | 2 -Divergence operator | 0.0001773008 | 1.1 | 2 -Update ::mettre_a_jour | 0.000312985 | 2.0 | 1 -Computation of the time step dt | 0.0002489606 | 1.6 | 2 -Post-treatment operations | 0.003119252 | 20.1 | 1 -Other operations | 0.0009066113 | 5.9 | - -Average number of iteration of the linear solver per call: 42 - - ------------------------------------------------------------------------------------------------------------ - GPU statistics ------------------------------------------------------------------------------------------------------------ -Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ------------------------------------------------------------------------------------------------------------ -Libraries: | 0.00801202 | 57.7 | 1 | -Kernels: | 0.00229587 | 16.5 | 98 | -Copy host to device: | 0.000151657 | 1.1 | 8 | 1.4 GB/s -Copy device to host: | 0.00014025 | 1.0 | 4 | 3.9 GB/s -Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 74% Copy H<->D: 2.1% Alloc/free: 0% Comm: 0% CPU & I/O: 24% -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Post-resolution statistics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.00522131 - -Total time for the whole computation 39.366 - -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (1667 s): 0.521 kW 0.241 kWh 0.024 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.topaze_cc80 
b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.topaze_cc80 index d45c1128a2..f32139d334 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.topaze_cc80 +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.topaze_cc80 @@ -8,52 +8,51 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 16-11-2025 -- 08:34:37 -OS: topaze7068__Linux__x86_64__4.18.0-553.69.1.el8_10.x86_64__#1 SMP Thu Aug 7 18:10:00 EDT 2025 -CPU: Model name: AMD EPYC 7763 64-Core Processor ; Thread(s) per core: 2 -GPU: | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | -| 0 NVIDIA A100-SXM4-80GB On | 0000000 -Nb procs: 1 -TRUST version: 1.9.7_beta +Date: 15-05-2026 -- 13:41:54 +OS: topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : AMD EPYC 7763 64-Core Processor +Total number of threads:256 +GPU model: NVIDIA A100-SXM4-80GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 51840 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 2.35008 -Percent of untracked time during computation start-up: 1.74462e-07 +Total time of the start-up: 1.39309 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.362608 -Average number of iteration of the linear solver per call: 48 +Average time of the resolution of the linear problem per call: 0.173037 +Average 
number of iteration of the linear solver per call: 26 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.181071 +Total time of the time loop: 0.117747 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.020119 -Standard deviation between time steps: 0.00767738 -Time elapsed in the skipped time steps: 0.0213211 +Average time per time step: 0.013083 +Standard deviation between time steps: 0.007486 +Time elapsed in the skipped time steps: 0.0172257 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.014519 | 72.2 | 1 -Convection operator | 0.0003598946 | 1.8 | 1 -Diffusion operator | 0.0001784016 | 0.9 | 1 -Gradient operator | 0.0002058404 | 1.0 | 2 -Divergence operator | 0.0001244321 | 0.6 | 2 -Update ::mettre_a_jour | 0.0003190001 | 1.6 | 1 -Computation of the time step dt | 0.0001833313 | 0.9 | 2 -Post-treatment operations | 0.003421306 | 17.0 | 1 -Other operations | 0.000807747 | 4.0 | +Linear solver resolutions Ax=B | 0.00780977 | 59.7 | 1 +Convection operator | 0.0002721863 | 2.1 | 1 +Diffusion operator | 0.0001520589 | 1.2 | 1 +Gradient operator | 0.0001971098 | 1.5 | 2 +Divergence operator | 0.0001235297 | 0.9 | 2 +Update ::mettre_a_jour | 0.0002596853 | 2.0 | 1 +Computation of the time step dt | 0.0001791377 | 1.4 | 2 +Post-treatment operations | 0.003267401 | 25.0 | 1 +Other operations | 0.0008220941 | 6.3 | -Untracked time | 5.96e-05 | 0.0329 | - - -Average number of iteration of the linear solver per call: 41.8 +Average number of iteration of the linear solver per call: 
22.2 ----------------------------------------------------------------------------------------------------------- @@ -61,16 +60,17 @@ Average number of iteration of the linear solver per call: 41.8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0144396 | 71.8 | 1 | -Kernels: | 0.00169249 | 8.4 | 98 | -Copy host to device: | 0.000121641 | 0.6 | 8 | 1.8 GB/s -Copy device to host: | 0.0001136 | 0.6 | 4 | 4.8 GB/s +Libraries: | 0.00773568 | 59.1 | 1 | +Kernels: | 0.0015436 | 11.8 | 97 | +Copy host to device: | 0.000123199 | 0.9 | 8 | 1.7 GB/s +Copy device to host: | 0.00011577 | 0.9 | 4 | 4.7 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 80% Copy H<->D: 1.2% Alloc/free: 0% Comm: 0% CPU & I/O: 19% +GPU: 71% Copy H<->D: 1.8% Alloc/free: 0% Comm: 0% CPU & I/O: 27% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.00642543 +Time of the post-resolution: 0.00583202 -Total time for the whole computation 2.5589 +Total time for the whole computation 1.53391 +[Slurm] Power consumption (28 s): 0.607 kW 0.005 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.data b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.data index da6a748340..463d7e03d9 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.data +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.data @@ -41,7 +41,8 @@ END PARTITION # Scatter DOM.Zones dom END SCATTER # -VEFPreP1B dis Lire dis { P0 } +VEFPreP1B dis +Lire dis { P0 reorder { algo Hilbert } } # Runge_Kutta_ordre_3 # 
Scheme_euler_explicit sch diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100 new file mode 100644 index 0000000000..3faa5b481e --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the OpenMP_Iterateur_BENCH_AmgX_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 09-06-2026 -- 17:39:31 +OS: dalianvl08__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2592000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 10.2002 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 2.06413 +Average number of iteration of the linear solver per call: 41 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop 
statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.550944 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.061216 +Standard deviation between time steps: 0.00242635 +Time elapsed in the skipped time steps: 0.0899983 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0498086 | 81.4 | 1 +Convection operator | 0.002785091 | 4.5 | 1 +Diffusion operator | 0.0007763384 | 1.3 | 1 +Gradient operator | 0.001174778 | 1.9 | 2 +Divergence operator | 0.00054559 | 0.9 | 2 +Update ::mettre_a_jour | 0.0007365982 | 1.2 | 1 +Computation of the time step dt | 0.00056079 | 0.9 | 2 +Post-treatment operations | 0.001837264 | 3.0 | 1 +Other operations | 0.002991011 | 4.9 | + +Average number of iteration of the linear solver per call: 31.1 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.049687 | 81.2 | 1 | +Kernels: | 0.00790305 | 12.9 | 96 | +Copy host to device: | 0.00016891 | 0.3 | 8 | 16.4 GB/s +Copy device to host: | 0.000112647 | 0.2 | 4 | 54.9 GB/s +Alloc/Free on device: | 0 | 0.0 | 0 | +GPU: 94% Copy H<->D: 0.46% Alloc/free: 0% Comm: 0% CPU & I/O: 5.5% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.191478 + +Total time for the whole computation 11.0326 + diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.eureka_cc89 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.eureka_cc89 new file mode 100644 index 0000000000..1c29c75593 --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.eureka_cc89 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the OpenMP_Iterateur_BENCH_AmgX_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:52:46 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2592000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 12.7089 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 
2.86719 +Average number of iteration of the linear solver per call: 41 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.913517 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.101502 +Standard deviation between time steps: 0.00424278 +Time elapsed in the skipped time steps: 0.122592 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0746953 | 73.6 | 1 +Convection operator | 0.007268645 | 7.2 | 1 +Diffusion operator | 0.002631552 | 2.6 | 1 +Gradient operator | 0.002676735 | 2.6 | 2 +Divergence operator | 0.001694524 | 1.7 | 2 +Update ::mettre_a_jour | 0.001962899 | 1.9 | 1 +Computation of the time step dt | 0.001669003 | 1.6 | 2 +Post-treatment operations | 0.003160382 | 3.1 | 1 +Other operations | 0.005742889 | 5.7 | + +Average number of iteration of the linear solver per call: 31.1 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0745032 | 73.4 | 1 | +Kernels: | 0.0227984 | 22.5 | 96 | +Copy host to device: | 0.000396213 | 0.4 | 8 | 7.0 GB/s +Copy device to host: | 0.00111494 | 1.1 | 4 | 5.5 GB/s +Alloc/Free on device: | 0 | 0.0 | 0 | +GPU: 96% Copy H<->D: 1.5% Alloc/free: 
0% Comm: 0% CPU & I/O: 2.6% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0563904 + +Total time for the whole computation 13.8014 + diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.irene-amd-ccrt_cc70 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.irene-amd-ccrt_cc70 index ed993402fa..aae10238db 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.irene-amd-ccrt_cc70 +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.irene-amd-ccrt_cc70 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 14:20:52 -OS: irene7053__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 23-04-2026 -- 14:57:06 +OS: irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz Total number of threads:80 GPU model: Tesla V100-SXM2-16GB @@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 19.755 +Total time of the start-up: 20.0048 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 4.47609 +Average time 
of the resolution of the linear problem per call: 4.83528 Average number of iteration of the linear solver per call: 41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.61756 +Total time of the time loop: 1.56294 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.179729 -Standard deviation between time steps: 0.00645205 -Time elapsed in the skipped time steps: 0.252334 +Average time per time step: 0.17366 +Standard deviation between time steps: 0.00621117 +Time elapsed in the skipped time steps: 0.240243 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.127466 | 70.9 | 1 -Convection operator | 0.01145873 | 6.4 | 1 -Diffusion operator | 0.004238428 | 2.4 | 1 -Gradient operator | 0.0100999 | 5.6 | 2 -Divergence operator | 0.006367787 | 3.5 | 2 -Update ::mettre_a_jour | 0.005436899 | 3.0 | 1 -Computation of the time step dt | 0.006054541 | 3.4 | 2 -Post-treatment operations | 0.0041794 | 2.3 | 1 -Other operations | 0.00442691 | 2.5 | +Linear solver resolutions Ax=B | 0.127391 | 73.4 | 1 +Convection operator | 0.01146404 | 6.6 | 1 +Diffusion operator | 0.004249371 | 2.4 | 1 +Gradient operator | 0.004379879 | 2.5 | 2 +Divergence operator | 0.006376915 | 3.7 | 2 +Update ::mettre_a_jour | 0.00544122 | 3.1 | 1 +Computation of the time step dt | 0.006055941 | 3.5 | 2 +Post-treatment operations | 0.004124303 | 2.4 | 1 +Other operations | 0.004177375 | 2.4 | Average number of iteration of the linear solver per call: 31.1 @@ -60,17 +60,17 @@ Average number of iteration of the linear 
solver per call: 31.1 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.127187 | 70.8 | 1 | -Kernels: | 0.046662 | 26.0 | 97 | -Copy host to device: | 0.000836908 | 0.5 | 8 | 3.3 GB/s -Copy device to host: | 0.00149839 | 0.8 | 4 | 4.1 GB/s +Libraries: | 0.127112 | 73.2 | 1 | +Kernels: | 0.0407369 | 23.5 | 96 | +Copy host to device: | 0.000828729 | 0.5 | 8 | 3.3 GB/s +Copy device to host: | 0.00136563 | 0.8 | 4 | 4.5 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 97% Copy H<->D: 1.3% Alloc/free: 0% Comm: 0% CPU & I/O: 2% +GPU: 97% Copy H<->D: 1.3% Alloc/free: 0% Comm: 0% CPU & I/O: 2.1% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.174162 +Time of the post-resolution: 0.144198 -Total time for the whole computation 21.7991 +Total time for the whole computation 21.9522 -[Slurm] Power consumption (38 s): 0.217 kW 0.002 kWh 0.000 € (0.10€/kWh) +[Slurm] Power consumption (36 s): 0.160 kW 0.002 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is157091_cc86 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is157091_cc86 index b039591652..3599c3f3fc 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is157091_cc86 +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is157091_cc86 @@ -8,7 +8,7 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 10-03-2026 -- 08:42:48 +Date: 22-04-2026 -- 07:55:23 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 @@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 11.0494 +Total time of the start-up: 10.7604 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 2.33687 +Average time of the resolution of the linear problem per call: 2.48938 Average number of iteration of the linear solver per call: 41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.63426 +Total time of the time loop: 1.58881 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.181585 -Standard deviation between time steps: 0.00729363 -Time elapsed in the skipped time steps: 0.2154 +Average time per time step: 0.176535 +Standard deviation between time steps: 0.00706665 +Time elapsed in the skipped time steps: 0.206322 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.131686 | 72.5 | 1 
-Convection operator | 0.01309432 | 7.2 | 1 -Diffusion operator | 0.00372165 | 2.0 | 1 -Gradient operator | 0.008522686 | 4.7 | 2 -Divergence operator | 0.004614091 | 2.5 | 2 -Update ::mettre_a_jour | 0.003988937 | 2.2 | 1 -Computation of the time step dt | 0.006508071 | 3.6 | 2 -Post-treatment operations | 0.00263361 | 1.5 | 1 -Other operations | 0.006814981 | 3.8 | +Linear solver resolutions Ax=B | 0.131459 | 74.5 | 1 +Convection operator | 0.01314436 | 7.4 | 1 +Diffusion operator | 0.003713571 | 2.1 | 1 +Gradient operator | 0.003726761 | 2.1 | 2 +Divergence operator | 0.004629008 | 2.6 | 2 +Update ::mettre_a_jour | 0.004018701 | 2.3 | 1 +Computation of the time step dt | 0.006475843 | 3.7 | 2 +Post-treatment operations | 0.002728818 | 1.5 | 1 +Other operations | 0.006638199 | 3.8 | Average number of iteration of the linear solver per call: 31.1 @@ -60,16 +60,16 @@ Average number of iteration of the linear solver per call: 31.1 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.131402 | 72.4 | 1 | -Kernels: | 0.0469629 | 25.9 | 97 | -Copy host to device: | 0.000331245 | 0.2 | 8 | 8.4 GB/s -Copy device to host: | 0.000617492 | 0.3 | 4 | 10.0 GB/s +Libraries: | 0.131171 | 74.3 | 1 | +Kernels: | 0.0419323 | 23.8 | 96 | +Copy host to device: | 0.000343416 | 0.2 | 8 | 8.1 GB/s +Copy device to host: | 0.000657896 | 0.4 | 4 | 9.4 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 98% Copy H<->D: 0.52% Alloc/free: 0% Comm: 0% CPU & I/O: 1.3% +GPU: 98% Copy H<->D: 0.57% Alloc/free: 0% Comm: 0% CPU & I/O: 1.4% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.207824 +Time of the post-resolution: 0.0673813 -Total time for the whole computation 13.1069 +Total time for the whole computation 12.6229 diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is159479_cc120 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is159479_cc120 new file mode 100644 index 0000000000..146dce3081 --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is159479_cc120 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the OpenMP_Iterateur_BENCH_AmgX_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:50:20 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2592000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 
7.24263 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.58355 +Average number of iteration of the linear solver per call: 41 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.578466 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.064274 +Standard deviation between time steps: 0.00427381 +Time elapsed in the skipped time steps: 0.0865707 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0491303 | 76.4 | 1 +Convection operator | 0.004380798 | 6.8 | 1 +Diffusion operator | 0.001477663 | 2.3 | 1 +Gradient operator | 0.001421014 | 2.2 | 2 +Divergence operator | 0.0007430587 | 1.2 | 2 +Update ::mettre_a_jour | 0.0009293183 | 1.4 | 1 +Computation of the time step dt | 0.000812331 | 1.3 | 2 +Post-treatment operations | 0.002051846 | 3.2 | 1 +Other operations | 0.003327638 | 5.2 | + +Average number of iteration of the linear solver per call: 31.1 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0490462 | 76.3 | 1 | +Kernels: | 0.0126528 | 19.7 | 96 | +Copy host to device: | 0.000231439 | 0.4 | 8 | 12.0 GB/s 
+Copy device to host: | 0.000867666 | 1.3 | 4 | 7.1 GB/s +Alloc/Free on device: | 0 | 0.0 | 0 | +GPU: 96% Copy H<->D: 1.7% Alloc/free: 0% Comm: 0% CPU & I/O: 2.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0518869 + +Total time for the whole computation 7.95956 + diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90 index d3bd20b90b..ed1afe0a6b 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90 +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 14:51:53 -OS: jzxh041__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +Date: 23-04-2026 -- 08:21:31 +OS: jzxh011__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 CPU model : Intel(R) Xeon(R) Platinum 8468 Total number of threads:192 GPU model: NVIDIA H100 80GB HBM3 @@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 29.0666 +Total time of the start-up: 12.8657 Number of 
calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 6.28925 +Average time of the resolution of the linear problem per call: 2.66272 Average number of iteration of the linear solver per call: 41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.612503 +Total time of the time loop: 0.606875 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0680559 -Standard deviation between time steps: 0.00259521 -Time elapsed in the skipped time steps: 0.104328 +Average time per time step: 0.0674305 +Standard deviation between time steps: 0.00262351 +Time elapsed in the skipped time steps: 0.164465 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0506794 | 74.5 | 1 -Convection operator | 0.003897978 | 5.7 | 1 -Diffusion operator | 0.001564434 | 2.3 | 1 -Gradient operator | 0.001842167 | 2.7 | 2 -Divergence operator | 0.001292618 | 1.9 | 2 -Update ::mettre_a_jour | 0.001611427 | 2.4 | 1 -Computation of the time step dt | 0.000948646 | 1.4 | 2 -Post-treatment operations | 0.002957776 | 4.3 | 1 -Other operations | 0.003261441 | 4.8 | +Linear solver resolutions Ax=B | 0.0499214 | 74.0 | 1 +Convection operator | 0.003907927 | 5.8 | 1 +Diffusion operator | 0.001567694 | 2.3 | 1 +Gradient operator | 0.001694054 | 2.5 | 2 +Divergence operator | 0.001322626 | 2.0 | 2 +Update ::mettre_a_jour | 0.001578162 | 2.3 | 1 +Computation of the time step dt | 0.0009521459 | 1.4 | 2 +Post-treatment operations | 0.002975969 | 4.4 | 1 +Other operations | 
0.003510551 | 5.2 | Average number of iteration of the linear solver per call: 31.1 @@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call: 31.1 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0505594 | 74.3 | 1 | -Kernels: | 0.012317 | 18.1 | 97 | -Copy host to device: | 0.000425871 | 0.6 | 8 | 6.5 GB/s -Copy device to host: | 0.000673408 | 1.0 | 4 | 9.2 GB/s +Libraries: | 0.0497978 | 73.9 | 1 | +Kernels: | 0.0121603 | 18.0 | 96 | +Copy host to device: | 0.000414936 | 0.6 | 8 | 6.7 GB/s +Copy device to host: | 0.0006133 | 0.9 | 4 | 10.1 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 92% Copy H<->D: 1.6% Alloc/free: 0% Comm: 0% CPU & I/O: 6% +GPU: 92% Copy H<->D: 1.5% Alloc/free: 0% Comm: 0% CPU & I/O: 6.6% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.190546 +Time of the post-resolution: 0.173993 -Total time for the whole computation 29.974 +Total time for the whole computation 13.8111 -[Slurm] Power consumption (46 s): 0.441 kW 0.006 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (22 s): 0.410 kW 0.003 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.topaze_cc80 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.topaze_cc80 index 02a0052a50..dc5440df70 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.topaze_cc80 +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.topaze_cc80 @@ -8,8 +8,8 @@ Time is given in seconds 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 17:11:18 -OS: topaze7071__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 13-06-2026 -- 12:17:57 +OS: topaze7062__Linux__x86_64__4.18.0-553.123.1.el8_10.x86_64__#1 SMP Mon May 4 13:45:48 EDT 2026 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 16.5775 +Total time of the start-up: 15.101 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 3.4427 +Average time of the resolution of the linear problem per call: 3.22742 Average number of iteration of the linear solver per call: 41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.906586 +Total time of the time loop: 0.877615 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.100732 -Standard deviation between time steps: 0.00337763 -Time elapsed in the skipped time steps: 0.169131 +Average time per time step: 0.0975127 +Standard deviation between time steps: 
0.00398041 +Time elapsed in the skipped time steps: 0.144939 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.075859 | 63.5 | 1 -Convection operator | 0.007069063 | 5.9 | 1 -Diffusion operator | 0.002373228 | 2.0 | 1 -Gradient operator | 0.002605388 | 2.2 | 2 -Divergence operator | 0.001750139 | 1.5 | 2 -Update ::mettre_a_jour | 0.001915461 | 1.6 | 1 -Computation of the time step dt | 0.002081152 | 1.7 | 2 -Post-treatment operations | 0.002956459 | 2.5 | 1 -Other operations | 0.004121942 | 3.4 | +Linear solver resolutions Ax=B | 0.0760297 | 78.0 | 1 +Convection operator | 0.006027708 | 6.2 | 1 +Diffusion operator | 0.002294193 | 2.4 | 1 +Gradient operator | 0.002297408 | 2.4 | 2 +Divergence operator | 0.00122019 | 1.3 | 2 +Update ::mettre_a_jour | 0.00161066 | 1.7 | 1 +Computation of the time step dt | 0.001546687 | 1.6 | 2 +Post-treatment operations | 0.002914558 | 3.0 | 1 +Other operations | 0.003571631 | 3.7 | Average number of iteration of the linear solver per call: 31.1 @@ -60,16 +60,17 @@ Average number of iteration of the linear solver per call: 31.1 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0757034 | 75.2 | 1 | -Kernels: | 0.0210069 | 20.9 | 97 | -Copy host to device: | 0.000323824 | 0.3 | 8 | 8.5 GB/s -Copy device to host: | 0.000551814 | 0.5 | 4 | 11.2 GB/s +Libraries: | 0.07588 | 77.8 | 1 | +Kernels: | 0.0176215 | 18.1 | 96 | +Copy host to device: | 0.000328309 | 0.3 | 8 | 8.4 GB/s +Copy device to host: | 0.00052074 | 0.5 | 4 | 11.9 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 96% Copy H<->D: 0.87% Alloc/free: 0% Comm: 0% CPU & I/O: 3.1% 
+GPU: 96% Copy H<->D: 0.87% Alloc/free: 0% Comm: 0% CPU & I/O: 3.2% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.295341 +Time of the post-resolution: 0.163937 -Total time for the whole computation 17.9485 +Total time for the whole computation 16.2875 +[Slurm] Power consumption (55 s): 0.416 kW 0.006 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90a b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90a index 237ba5b0d9..71f04bb7ea 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90a +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 12-03-2026 -- 16:38:02 -OS: g1229__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 09:27:31 +OS: g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the 
start-up: 15.1194 +Total time of the start-up: 14.8615 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.55704 -Average number of iteration of the linear solver per call: 17.5 +Average time of the resolution of the linear problem per call: 1.74019 +Average number of iteration of the linear solver per call: 18.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.11758 +Total time of the time loop: 1.0108 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.124175 -Standard deviation between time steps: 0.00298425 -Time elapsed in the skipped time steps: 0.163265 +Average time per time step: 0.112311 +Standard deviation between time steps: 0.00396454 +Time elapsed in the skipped time steps: 0.135985 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0515469 | 41.5 | 1 -Convection operator | 0.01753108 | 14.1 | 1 -Diffusion operator | 0.008836219 | 7.1 | 1 -Gradient operator | 0.02405718 | 19.4 | 2 -Divergence operator | 0.005545095 | 4.5 | 2 -Update ::mettre_a_jour | 0.004695313 | 3.8 | 1 -Computation of the time step dt | 0.00673917 | 5.4 | 2 -Post-treatment operations | 0.002858661 | 2.3 | 1 -Other operations | 0.002365791 | 1.9 | +Linear solver resolutions Ax=B | 0.0553498 | 49.3 | 1 +Convection operator | 0.01779112 | 15.8 | 1 +Diffusion operator | 0.009032714 | 8.0 | 1 +Gradient operator | 0.007908956 | 7.0 | 2 +Divergence operator | 0.005581833 | 5.0 | 2 +Update ::mettre_a_jour | 0.004612555 | 4.1 | 1 
+Computation of the time step dt | 0.006740941 | 6.0 | 2 +Post-treatment operations | 0.002857232 | 2.5 | 1 +Other operations | 0.002436298 | 2.2 | -Average number of iteration of the linear solver per call: 13.6 +Average number of iteration of the linear solver per call: 14.4 ----------------------------------------------------------------------------------------------------------- @@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call: 13.6 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0513757 | 41.4 | 1 | -Kernels: | 0.0690257 | 55.6 | 97 | -Copy host to device: | 0.000286796 | 0.2 | 8 | 9.7 GB/s -Copy device to host: | 0.000368582 | 0.3 | 4 | 16.8 GB/s +Libraries: | 0.0551808 | 49.1 | 1 | +Kernels: | 0.0532523 | 47.4 | 96 | +Copy host to device: | 0.000285677 | 0.3 | 8 | 9.7 GB/s +Copy device to host: | 0.000371596 | 0.3 | 4 | 16.6 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 97% Copy H<->D: 0.53% Alloc/free: 0% Comm: 0% CPU & I/O: 2.5% +GPU: 97% Copy H<->D: 0.59% Alloc/free: 0% Comm: 0% CPU & I/O: 2.9% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.150748 +Time of the post-resolution: 0.155407 -Total time for the whole computation 16.551 +Total time for the whole computation 16.1637 -[Slurm] Power consumption (24 s): 0.395 kW 0.003 kWh 0.000 € (0.10€/kWh) +[Slurm] Power consumption (23 s): 0.392 kW 0.003 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90a 
b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90a index 590e3e362a..efe1a573df 100644 --- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90a +++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 24-02-2026 -- 00:15:17 -OS: nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 15-05-2026 -- 20:43:18 +OS: nid005023__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 53.4901 +Total time of the start-up: 72.2337 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 3.29056 -Average number of iteration of the linear solver per call: 17.5 +Average time of the resolution of the linear problem per call: 3.83761 +Average number of iteration of the linear solver per call: 19.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step 
is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.12433 +Total time of the time loop: 1.00974 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.124926 -Standard deviation between time steps: 0.00291511 -Time elapsed in the skipped time steps: 0.149759 +Average time per time step: 0.112194 +Standard deviation between time steps: 0.00433867 +Time elapsed in the skipped time steps: 0.138662 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0503064 | 35.5 | 1 -Convection operator | 0.02040081 | 14.4 | 1 -Diffusion operator | 0.007673064 | 5.4 | 1 -Gradient operator | 0.02397075 | 16.9 | 2 -Divergence operator | 0.005495745 | 3.9 | 2 -Update ::mettre_a_jour | 0.004568919 | 3.2 | 1 -Computation of the time step dt | 0.006858336 | 4.8 | 2 -Post-treatment operations | 0.002841983 | 2.0 | 1 -Other operations | 0.002810071 | 2.0 | +Linear solver resolutions Ax=B | 0.0555408 | 49.5 | 1 +Convection operator | 0.01741344 | 15.5 | 1 +Diffusion operator | 0.008625326 | 7.7 | 1 +Gradient operator | 0.008388428 | 7.5 | 2 +Divergence operator | 0.005668888 | 5.1 | 2 +Update ::mettre_a_jour | 0.004655274 | 4.1 | 1 +Computation of the time step dt | 0.006789526 | 6.1 | 2 +Post-treatment operations | 0.002835855 | 2.5 | 1 +Other operations | 0.002276084 | 2.0 | -Average number of iteration of the linear solver per call: 13.6 +Average number of iteration of the linear solver per call: 15 ----------------------------------------------------------------------------------------------------------- @@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call: 13.6 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0501355 | 40.1 | 1 | -Kernels: | 0.0709778 | 56.8 | 97 | -Copy host to device: | 0.000305926 | 0.2 | 8 | 9.0 GB/s -Copy device to host: | 0.000366232 | 0.3 | 4 | 16.9 GB/s +Libraries: | 0.0553671 | 49.3 | 1 | +Kernels: | 0.0529794 | 47.2 | 96 | +Copy host to device: | 0.000301069 | 0.3 | 8 | 9.2 GB/s +Copy device to host: | 0.00036894 | 0.3 | 4 | 16.8 GB/s Alloc/Free on device: | 0 | 0.0 | 0 | -GPU: 97% Copy H<->D: 0.54% Alloc/free: 0% Comm: 0% CPU & I/O: 2.5% +GPU: 97% Copy H<->D: 0.6% Alloc/free: 0% Comm: 0% CPU & I/O: 2.8% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.143902 +Time of the post-resolution: 0.15637 -Total time for the whole computation 54.9081 +Total time for the whole computation 73.5385 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (101 s): 0.467 kW 0.013 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100x4 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100x4 new file mode 100644 index 0000000000..e1d34358e0 --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100x4 @@ -0,0 +1,101 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_AmgX_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-06-2026 -- 12:56:02 +OS: dalianvl16__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 4 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2592000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 6.31723 +Number of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 113 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.987298 +Average number of iteration of the linear solver per call: 45.5 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.66244 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.184716 +Standard deviation between time steps: 
0.013327 +Time elapsed in the skipped time steps: 0.251559 + +Percent of total time spend in communication: 0.554609 + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.176631 | 95.6 | 1 +Convection operator | 0.0009179114 | 0.5 | 1 +Diffusion operator | 0.0003050746 | 0.2 | 1 +Gradient operator | 0.0003565659 | 0.2 | 2 +Divergence operator | 0.0002437128 | 0.1 | 2 +Update ::mettre_a_jour | 0.001284476 | 0.7 | 1 +Computation of the time step dt | 0.0003241821 | 0.2 | 2 +Post-treatment operations | 0.0007485209 | 0.4 | 1 +Other operations | 0.0039044 | 2.1 | +Number of virtual exchanges per time step: 9 +Maximum number of MPI allreduce per time step 14 + +Average number of iteration of the linear solver per call: 34.7 + + +--------------------------------------------------------------------------------------------------------- +Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. 
+--------------------------------------------------------------------------------------------------------- + +Average of the fraction of the time spent in communications between processors: 0.8 % +Max of the fraction of the time spent in communications between processors: 1.1 % +Min of the fraction of the time spent in communications between processors: 0.6 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 3.48609e-06 +Network maximum bandwidth on all processors: 54.2 GB/s +Total network traffic: 194.73 MB/time step +Average message size: 740.105 kB +Min waiting time: 0.6 % of total time +Max waiting time: 1.1 % of total time +Avg waiting time: 0.95 % of total time + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.176361 | 95.5 | 1 | +Kernels: | 0.00414017 | 2.2 | 151 | +Copy host to device: | 0.000122941 | 0.1 | 5 | 6.2 GB/s +Copy device to host: | 0.000104189 | 0.1 | 4 | 14.6 GB/s +Alloc/Free on device: | 4.65778e-07 | 0.0 | 6 | +GPU: 98% Copy H<->D: 0.12% Alloc/free: 0.00025% Comm: 0.64% CPU & I/O: 1.5% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.469793 +Maximum number of MPI allreduce per time step 6 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Max waiting time big => probably due to a bad partitioning +Communications > 
30% => too many processors or network too slow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Total time for the whole computation 8.70103 + diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90x4 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90x4 new file mode 100644 index 0000000000..bd4d2b39f6 --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90x4 @@ -0,0 +1,102 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_AmgX_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 10-06-2026 -- 15:24:25 +OS: jzxh032__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.20 +Nb procs used for the computation: 4 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2592000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 8.97592 +Number of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 113 + +Number 
of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.69108 +Average number of iteration of the linear solver per call: 45.5 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.912864 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.101429 +Standard deviation between time steps: 0.00683941 +Time elapsed in the skipped time steps: 0.232016 + +Percent of total time spend in communication: 0.73194 + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0933083 | 92.0 | 1 +Convection operator | 0.001174145 | 1.2 | 1 +Diffusion operator | 0.0004917334 | 0.5 | 1 +Gradient operator | 0.000429222 | 0.4 | 2 +Divergence operator | 0.0002555969 | 0.3 | 2 +Update ::mettre_a_jour | 0.001087015 | 1.1 | 1 +Computation of the time step dt | 0.0004033608 | 0.4 | 2 +Post-treatment operations | 0.001093626 | 1.1 | 1 +Other operations | 0.003186304 | 3.1 | +Number of virtual exchanges per time step: 9 +Maximum number of MPI allreduce per time step 14 + +Average number of iteration of the linear solver per call: 34.7 + + +--------------------------------------------------------------------------------------------------------- +Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. 
+--------------------------------------------------------------------------------------------------------- + +Average of the fraction of the time spent in communications between processors: 2.7 % +Max of the fraction of the time spent in communications between processors: 4.2 % +Min of the fraction of the time spent in communications between processors: 0.8 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 4.03775e-06 +Network maximum bandwidth on all processors: 62.0 GB/s +Total network traffic: 194.73 MB/time step +Average message size: 740.105 kB +Min waiting time: 0.8 % of total time +Max waiting time: 4.1 % of total time +Avg waiting time: 3.25 % of total time + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.093002 | 91.7 | 1 | +Kernels: | 0.00481281 | 4.7 | 151 | +Copy host to device: | 0.000142865 | 0.1 | 5 | 5.3 GB/s +Copy device to host: | 0.000247582 | 0.2 | 4 | 6.2 GB/s +Alloc/Free on device: | 3.79444e-07 | 0.0 | 6 | +GPU: 96% Copy H<->D: 0.38% Alloc/free: 0.00037% Comm: 0.92% CPU & I/O: 2.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.495779 +Maximum number of MPI allreduce per time step 6 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Max waiting time big => probably due to a bad partitioning +Communications > 
30% => too many processors or network too slow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Total time for the whole computation 10.6166 + +[Slurm] Power consumption (27 s): 0.438 kW 0.003 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX_10.TU.jean-zay_cc90x8 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX_10.TU.jean-zay_cc90x8 new file mode 100644 index 0000000000..4910e01b67 --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX_10.TU.jean-zay_cc90x8 @@ -0,0 +1,102 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_AmgX_10_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 10-06-2026 -- 16:05:33 +OS: jzxh250__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.20 +Nb procs used for the computation: 8 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 80864000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 62.3521 +Number 
of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 113 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 15.4526 +Average number of iteration of the linear solver per call: 97 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 4.87771 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.541968 +Standard deviation between time steps: 0.0373707 +Time elapsed in the skipped time steps: 0.944194 + +Percent of total time spend in communication: 1.12627 + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.479851 | 88.5 | 1 +Convection operator | 0.01469592 | 2.7 | 1 +Diffusion operator | 0.00588249 | 1.1 | 1 +Gradient operator | 0.005783263 | 1.1 | 2 +Divergence operator | 0.00287426 | 0.5 | 2 +Update ::mettre_a_jour | 0.01230296 | 2.3 | 1 +Computation of the time step dt | 0.003788952 | 0.7 | 2 +Post-treatment operations | 0.004320248 | 0.8 | 1 +Other operations | 0.01246934 | 2.3 | +Number of virtual exchanges per time step: 9 +Maximum number of MPI allreduce per time step 14 + +Average number of iteration of the linear solver per call: 72 + + +--------------------------------------------------------------------------------------------------------- +Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. 
+--------------------------------------------------------------------------------------------------------- + +Average of the fraction of the time spent in communications between processors: 1.6 % +Max of the fraction of the time spent in communications between processors: 2.6 % +Min of the fraction of the time spent in communications between processors: 1.2 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 5.31916e-06 +Network maximum bandwidth on all processors: 172.1 GB/s +Total network traffic: 2535.2 MB/time step +Average message size: 4477.39 kB +Min waiting time: 1.3 % of total time +Max waiting time: 2.4 % of total time +Avg waiting time: 1.9625 % of total time + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.479276 | 88.4 | 1 | +Kernels: | 0.0465683 | 8.6 | 133 | +Copy host to device: | 0.000555787 | 0.1 | 5 | 8.8 GB/s +Copy device to host: | 0.000704379 | 0.1 | 4 | 13.0 GB/s +Alloc/Free on device: | 4.16333e-07 | 0.0 | 6 | +GPU: 97% Copy H<->D: 0.23% Alloc/free: 7.7e-05% Comm: 1.3% CPU & I/O: 1.4% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 1.47244 +Maximum number of MPI allreduce per time step 6 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Max waiting time big => probably due to a bad partitioning +Communications > 
30% => too many processors or network too slow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Total time for the whole computation 69.6465 + +[Slurm] Power consumption (86 s): 0.950 kW 0.023 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90ax4 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90ax4 index f63e88a6e3..d47299e2f6 100644 --- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90ax4 +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90ax4 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 21:16:48 -OS: g1023__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 08-06-2026 -- 16:11:54 +OS: g1323__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,74 +22,74 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 9.11069 -Number of virtual exchanges: 61 -Maximum number of MPI allreduce per time step 113 +Total time of the start-up: 15.7382 +Number of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 112 Number of calls to the linear solver per time 
step: 2 -Average time of the resolution of the linear problem per call: 0.745132 -Average number of iteration of the linear solver per call: 17.5 +Average time of the resolution of the linear problem per call: 1.49685 +Average number of iteration of the linear solver per call: 19 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.749941 +Total time of the time loop: 0.536061 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0833267 -Standard deviation between time steps: 0.00336632 -Time elapsed in the skipped time steps: 0.109205 +Average time per time step: 0.0595624 +Standard deviation between time steps: 0.0020151 +Time elapsed in the skipped time steps: 0.0832984 -Percent of total time spend in communication: 1.85683 +Percent of total time spend in communication: 2.96068 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0588244 | 61.6 | 1 -Convection operator | 0.00587872 | 6.2 | 1 -Diffusion operator | 0.002212806 | 2.3 | 1 -Gradient operator | 0.006406243 | 6.7 | 2 -Divergence operator | 0.001422708 | 1.5 | 2 -Update ::mettre_a_jour | 0.002037852 | 2.1 | 1 -Computation of the time step dt | 0.001919701 | 2.0 | 2 -Post-treatment operations | 0.00125179 | 1.3 | 1 -Other operations | 0.00337248 | 3.5 | -Number of virtual exchanges per time step: 10 +Linear solver resolutions Ax=B | 0.0390233 | 65.5 | 1 +Convection operator | 0.005349557 | 9.0 | 1 +Diffusion operator | 0.002420475 | 4.1 | 1 +Gradient operator | 0.002731023 | 4.6 | 2 +Divergence operator | 0.0008404393 | 
1.4 | 2 +Update ::mettre_a_jour | 0.001865685 | 3.1 | 1 +Computation of the time step dt | 0.001907613 | 3.2 | 2 +Post-treatment operations | 0.001141614 | 1.9 | 1 +Other operations | 0.004282682 | 7.2 | +Number of virtual exchanges per time step: 9 Maximum number of MPI allreduce per time step 14 -Average number of iteration of the linear solver per call: 13.6 +Average number of iteration of the linear solver per call: 14.8 --------------------------------------------------------------------------------------------------------- Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. --------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 2.2 % -Max of the fraction of the time spent in communications between processors: 2.9 % -Min of the fraction of the time spent in communications between processors: 2.1 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 2.15456e-06 -Network maximum bandwidth on all processors: 33.5 GB/s +Average of the fraction of the time spent in communications between processors: 3.2 % +Max of the fraction of the time spent in communications between processors: 4 % +Min of the fraction of the time spent in communications between processors: 3.4 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 2.38312e-06 +Network maximum bandwidth on all processors: 31.0 GB/s Total network traffic: 194.73 MB/time step Average message size: 751.53 kB -Min waiting time: 2 % of total time -Max waiting time: 2.9 % of total time -Avg waiting time: 2.5 % of total time +Min waiting time: 3.3 % of total time +Max waiting time: 3.9 % of total time +Avg waiting time: 3.6 % of total time ----------------------------------------------------------------------------------------------------------- GPU statistics 
----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0585978 | 70.3 | 1 | -Kernels: | 0.0209497 | 25.1 | 151 | -Copy host to device: | 0.000159277 | 0.2 | 5 | 4.8 GB/s -Copy device to host: | 0.000248381 | 0.3 | 4 | 6.1 GB/s -Alloc/Free on device: | 1.34147e-05 | 0.0 | 6 | -GPU: 95% Copy H<->D: 0.49% Alloc/free: 0.016% Comm: 2.1% CPU & I/O: 1.9% +Libraries: | 0.0387932 | 65.1 | 1 | +Kernels: | 0.016855 | 28.3 | 151 | +Copy host to device: | 0.000166681 | 0.3 | 5 | 4.6 GB/s +Copy device to host: | 0.000260071 | 0.4 | 4 | 5.9 GB/s +Alloc/Free on device: | 3.67556e-07 | 0.0 | 6 | +GPU: 93% Copy H<->D: 0.72% Alloc/free: 0.00062% Comm: 3.4% CPU & I/O: 2.4% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.745979 +Time of the post-resolution: 0.75471 Maximum number of MPI allreduce per time step 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,6 +97,6 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 10.7158 +Total time for the whole computation 17.1123 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (28 s): 0.465 kW 0.004 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx942x4 
b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx942x4 index 07313561f2..cc3bd0fc9b 100644 --- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx942x4 +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx942x4 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 18:11:04 +Date: 08-06-2026 -- 14:46:38 OS: a1001__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD Instinct MI300A Accelerator Total number of threads:192 GPU model: AMD Instinct MI300A -HIP runtime version: 6.41 -HIP drivers version: 6.41 +HIP runtime version: 6.43 +HIP drivers version: 6.43 Nb procs used for the computation: 4 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 2592000 @@ -22,74 +22,74 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 4.0396 -Number of virtual exchanges: 61 -Maximum number of MPI allreduce per time step 113 +Total time of the start-up: 10.1716 +Number of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 112 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.66545 -Average number of iteration of the linear solver per call: 17.5 +Average time of the resolution of the linear problem per call: 0.95387 +Average number of iteration of the linear 
solver per call: 16.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.783478 +Total time of the time loop: 0.480853 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0870531 -Standard deviation between time steps: 0.00559937 -Time elapsed in the skipped time steps: 0.123638 +Average time per time step: 0.0534281 +Standard deviation between time steps: 0.00226311 +Time elapsed in the skipped time steps: 0.0790494 -Percent of total time spend in communication: 2.03525 +Percent of total time spend in communication: 3.33139 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0768391 | 76.2 | 1 -Convection operator | 0.001744362 | 1.7 | 1 -Diffusion operator | 0.0007313998 | 0.7 | 1 -Gradient operator | 0.0007877113 | 0.8 | 2 -Divergence operator | 0.0007087598 | 0.7 | 2 -Update ::mettre_a_jour | 0.001926733 | 1.9 | 1 -Computation of the time step dt | 0.0008340134 | 0.8 | 2 -Post-treatment operations | 0.001047411 | 1.0 | 1 -Other operations | 0.00243357 | 2.4 | -Number of virtual exchanges per time step: 10 +Linear solver resolutions Ax=B | 0.0431755 | 80.8 | 1 +Convection operator | 0.00164288 | 3.1 | 1 +Diffusion operator | 0.0007242174 | 1.4 | 1 +Gradient operator | 0.0007095389 | 1.3 | 2 +Divergence operator | 0.0005097047 | 1.0 | 2 +Update ::mettre_a_jour | 0.001811757 | 3.4 | 1 +Computation of the time step dt | 0.0008200627 | 1.5 | 2 +Post-treatment operations | 0.0009717177 | 1.8 | 1 +Other operations | 0.00306278 | 5.7 | +Number of virtual exchanges per time step: 9 
Maximum number of MPI allreduce per time step 14 -Average number of iteration of the linear solver per call: 13.6 +Average number of iteration of the linear solver per call: 13.2 --------------------------------------------------------------------------------------------------------- Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. --------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 2 % -Max of the fraction of the time spent in communications between processors: 2.7 % -Min of the fraction of the time spent in communications between processors: 2 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 5.92733e-06 -Network maximum bandwidth on all processors: 39.2 GB/s +Average of the fraction of the time spent in communications between processors: 3.3 % +Max of the fraction of the time spent in communications between processors: 4.2 % +Min of the fraction of the time spent in communications between processors: 3.5 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 5.58282e-06 +Network maximum bandwidth on all processors: 36.3 GB/s Total network traffic: 194.73 MB/time step Average message size: 751.53 kB -Min waiting time: 2 % of total time -Max waiting time: 2.6 % of total time -Avg waiting time: 2.2 % of total time +Min waiting time: 3.3 % of total time +Max waiting time: 4 % of total time +Avg waiting time: 3.65 % of total time ----------------------------------------------------------------------------------------------------------- GPU statistics ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0766393 | 88.0 | 1 | -Kernels: | 0.00695985 | 8.0 | 151 | -Copy host to device: | 9.11892e-05 | 0.1 | 5 | 8.4 GB/s -Copy device to host: | 0.000198859 | 0.2 | 4 | 7.7 GB/s -Alloc/Free on device: | 1.3989e-05 | 0.0 | 6 | -GPU: 96% Copy H<->D: 0.33% Alloc/free: 0.016% Comm: 2.4% CPU & I/O: 1.3% +Libraries: | 0.0429498 | 80.4 | 1 | +Kernels: | 0.00693115 | 13.0 | 151 | +Copy host to device: | 0.000129734 | 0.2 | 5 | 5.9 GB/s +Copy device to host: | 0.000197762 | 0.4 | 4 | 7.7 GB/s +Alloc/Free on device: | 3.04444e-07 | 0.0 | 6 | +GPU: 93% Copy H<->D: 0.61% Alloc/free: 0.00057% Comm: 3.9% CPU & I/O: 2.1% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.815148 +Time of the post-resolution: 0.792903 Maximum number of MPI allreduce per time step 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,6 +97,6 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 5.76188 +Total time for the whole computation 11.5244 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (19 s): 0.680 kW 0.004 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.is157091_cc86x2 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.is157091_cc86x2 index 3f45adaf4f..25fb24f1eb 100644 --- 
a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.is157091_cc86x2 +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.is157091_cc86x2 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 11-03-2026 -- 18:35:57 +Date: 08-06-2026 -- 11:20:06 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 GPU model: NVIDIA RTX A6000 CUDA runtime version: 12.90 -CUDA drivers version: 12.70 +CUDA drivers version: 13.20 Nb procs used for the computation: 2 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 2592000 @@ -22,39 +22,39 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 7.32206 -Number of virtual exchanges: 61 -Maximum number of MPI allreduce per time step 113 +Total time of the start-up: 5.76054 +Number of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 112 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.74206 +Average time of the resolution of the linear problem per call: 1.20091 Average number of iteration of the linear solver per call: 18.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 4.2591 +Total time of the time loop: 1.92849 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.473233 -Standard deviation between time steps: 0.0281796 -Time elapsed in the skipped time steps: 0.559638 +Average time per time step: 0.214277 +Standard deviation between time steps: 0.0078799 +Time elapsed in the skipped time steps: 0.253926 -Percent of total time spend in communication: 0.541268 +Percent of total time spend in communication: 3.72389 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.403631 | 85.3 | 1 -Convection operator | 0.01615821 | 3.4 | 1 -Diffusion operator | 0.004912699 | 1.0 | 1 -Gradient operator | 0.007781903 | 1.6 | 2 -Divergence operator | 0.003409271 | 0.7 | 2 -Update ::mettre_a_jour | 0.007377545 | 1.6 | 1 -Computation of the time step dt | 0.008302543 | 1.8 | 2 -Post-treatment operations | 0.002612658 | 0.6 | 1 -Other operations | 0.0190475 | 4.0 | -Number of virtual exchanges per time step: 10 +Linear solver resolutions Ax=B | 0.161752 | 75.5 | 1 +Convection operator | 0.01520807 | 7.1 | 1 +Diffusion operator | 0.00419924 | 2.0 | 1 +Gradient operator | 0.003383048 | 1.6 | 2 +Divergence operator | 0.001593399 | 0.7 | 2 +Update ::mettre_a_jour | 0.004787313 | 2.2 | 1 +Computation of the time step dt | 0.007714073 | 3.6 | 2 +Post-treatment operations | 0.002160597 | 1.0 | 1 +Other operations | 0.01347913 | 6.3 | +Number of virtual exchanges per time step: 9 Maximum number of MPI allreduce per time step 14 Average number of iteration of the linear solver per call: 13.7 @@ -64,32 +64,32 @@ Average number of iteration of the linear solver per 
call: 13.7 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. --------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 0.5 % -Max of the fraction of the time spent in communications between processors: 0.6 % -Min of the fraction of the time spent in communications between processors: 0.6 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 1.28883e-06 -Network maximum bandwidth on all processors: 21.0 GB/s +Average of the fraction of the time spent in communications between processors: 3.1 % +Max of the fraction of the time spent in communications between processors: 4.3 % +Min of the fraction of the time spent in communications between processors: 2.7 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 2.34412e-06 +Network maximum bandwidth on all processors: 8.4 GB/s Total network traffic: 93.8292 MB/time step Average message size: 1893.41 kB -Min waiting time: 0.6 % of total time -Max waiting time: 0.6 % of total time -Avg waiting time: 0.6 % of total time +Min waiting time: 2.8 % of total time +Max waiting time: 4.1 % of total time +Avg waiting time: 3.45 % of total time ----------------------------------------------------------------------------------------------------------- GPU statistics ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.402598 | 85.1 | 1 | -Kernels: | 0.0651709 | 13.8 | 115 | -Copy host to device: | 0.000457177 | 0.1 | 5 | 3.1 GB/s -Copy device to host: | 0.000608826 | 0.1 | 4 | 5.0 GB/s -Alloc/Free on device: | 7.70153e-05 | 0.0 | 6 | -GPU: 99% Copy 
H<->D: 0.23% Alloc/free: 0.016% Comm: 0.61% CPU & I/O: 0.3% +Libraries: | 0.161239 | 75.2 | 1 | +Kernels: | 0.0413889 | 19.3 | 115 | +Copy host to device: | 0.000220838 | 0.1 | 5 | 6.4 GB/s +Copy device to host: | 0.00060274 | 0.3 | 4 | 5.1 GB/s +Alloc/Free on device: | 2.69667e-07 | 0.0 | 6 | +GPU: 95% Copy H<->D: 0.38% Alloc/free: 0.00013% Comm: 4.2% CPU & I/O: 0.84% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0699469 +Time of the post-resolution: 0.0763386 Maximum number of MPI allreduce per time step 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,5 +97,5 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 12.2108 +Total time for the whole computation 8.0193 diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90ax4 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90ax4 index c2bef7f8ba..30e69c3d42 100644 --- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90ax4 +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90ax4 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 24-02-2026 -- 00:19:51 -OS: 
nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 05-06-2026 -- 23:15:15 +OS: nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,74 +22,74 @@ Total number of elements used for the calculation: 2592000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 37.6937 -Number of virtual exchanges: 61 -Maximum number of MPI allreduce per time step 113 +Total time of the start-up: 50.583 +Number of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 112 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 2.50994 +Average time of the resolution of the linear problem per call: 3.77037 Average number of iteration of the linear solver per call: 17.5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.751937 +Total time of the time loop: 0.526528 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.0835485 -Standard deviation between time steps: 0.00334088 -Time elapsed in the skipped time steps: 0.109768 +Average time per time step: 0.0585031 +Standard deviation between time steps: 0.0027974 +Time elapsed in the skipped time steps: 0.0731386 -Percent of total time spend in 
communication: 1.7599 +Percent of total time spend in communication: 2.01856 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0598234 | 62.5 | 1 -Convection operator | 0.005831945 | 6.1 | 1 -Diffusion operator | 0.00206353 | 2.2 | 1 -Gradient operator | 0.006198873 | 6.5 | 2 -Divergence operator | 0.001405442 | 1.5 | 2 -Update ::mettre_a_jour | 0.002029805 | 2.1 | 1 -Computation of the time step dt | 0.001912096 | 2.0 | 2 -Post-treatment operations | 0.001101474 | 1.2 | 1 -Other operations | 0.003181929 | 3.3 | -Number of virtual exchanges per time step: 10 +Linear solver resolutions Ax=B | 0.0386508 | 66.1 | 1 +Convection operator | 0.005404908 | 9.2 | 1 +Diffusion operator | 0.002414174 | 4.1 | 1 +Gradient operator | 0.00281482 | 4.8 | 2 +Divergence operator | 0.0008206784 | 1.4 | 2 +Update ::mettre_a_jour | 0.00173183 | 3.0 | 1 +Computation of the time step dt | 0.001880702 | 3.2 | 2 +Post-treatment operations | 0.001186549 | 2.0 | 1 +Other operations | 0.00359863 | 6.2 | +Number of virtual exchanges per time step: 9 Maximum number of MPI allreduce per time step 14 -Average number of iteration of the linear solver per call: 13.6 +Average number of iteration of the linear solver per call: 14.1 --------------------------------------------------------------------------------------------------------- Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. 
--------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 1.9 % -Max of the fraction of the time spent in communications between processors: 2.5 % -Min of the fraction of the time spent in communications between processors: 1.9 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 2.09506e-06 -Network maximum bandwidth on all processors: 43.8 GB/s +Average of the fraction of the time spent in communications between processors: 2.9 % +Max of the fraction of the time spent in communications between processors: 4.3 % +Min of the fraction of the time spent in communications between processors: 2.2 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 2.31699e-06 +Network maximum bandwidth on all processors: 40.1 GB/s Total network traffic: 194.73 MB/time step Average message size: 751.53 kB -Min waiting time: 1.8 % of total time -Max waiting time: 2.4 % of total time -Avg waiting time: 2.125 % of total time +Min waiting time: 2.2 % of total time +Max waiting time: 4 % of total time +Avg waiting time: 3.225 % of total time ----------------------------------------------------------------------------------------------------------- GPU statistics ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0595808 | 71.3 | 1 | -Kernels: | 0.020409 | 24.4 | 151 | -Copy host to device: | 0.00016724 | 0.2 | 5 | 4.6 GB/s -Copy device to host: | 0.000260598 | 0.3 | 4 | 5.9 GB/s -Alloc/Free on device: | 1.6014e-05 | 0.0 | 6 | -GPU: 96% Copy H<->D: 0.51% Alloc/free: 0.019% Comm: 2% CPU & I/O: 1.7% +Libraries: | 0.0384101 | 65.7 | 1 | +Kernels: | 0.0168738 | 
28.8 | 151 | +Copy host to device: | 0.000169971 | 0.3 | 5 | 4.5 GB/s +Copy device to host: | 0.000271035 | 0.5 | 4 | 5.6 GB/s +Alloc/Free on device: | 4.24111e-07 | 0.0 | 6 | +GPU: 94% Copy H<->D: 0.75% Alloc/free: 0.00072% Comm: 2.3% CPU & I/O: 2.4% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.599376 +Time of the post-resolution: 0.829184 Maximum number of MPI allreduce per time step 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,6 +97,6 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 39.1548 +Total time for the whole computation 52.0119 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (74 s): 0.495 kW 0.010 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx90ax16 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx90ax16 index e56f9af632..c4c221bcd4 100644 --- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx90ax16 +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx90ax16 @@ -1,102 +1 @@ - # Global performance file # - -This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
-More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_PETSc_10_csv.TU file -For time loop, only standard counters of level 1 are printed alongside your custom counters -Time is given in seconds - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Context of the computation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 18-03-2026 -- 20:04:16 -OS: g1085__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 -CPU model : AMD EPYC 7A53 64-Core Processor -Total number of threads:128 -GPU model: AMD Instinct MI250X -HIP runtime version: 6.43 -HIP drivers version: 6.43 -Nb procs used for the computation: 16 -TRUST version: 1.9.8_beta -Total number of elements used for the calculation: 80864000 - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Computation start-up statistics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 54.0581 -Number of virtual exchanges: 61 -Maximum number of MPI allreduce per time step 113 - -Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 8.03732 -Average number of iteration of the linear solver per call: 23 - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Time loop statistics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 3.87395 -Number of time steps: 9 -Skipped time steps: 1 -Average time per time step: 0.430439 -Standard deviation 
between time steps: 0.0201513 -Time elapsed in the skipped time steps: 0.520931 - -Percent of total time spend in communication: 4.0433 - -Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------- -Linear solver resolutions Ax=B | 0.262045 | 60.9 | 1 -Convection operator | 0.03891157 | 9.0 | 1 -Diffusion operator | 0.01705334 | 4.0 | 1 -Gradient operator | 0.05048981 | 11.7 | 2 -Divergence operator | 0.01093064 | 2.5 | 2 -Update ::mettre_a_jour | 0.01398206 | 3.2 | 1 -Computation of the time step dt | 0.01677364 | 3.9 | 2 -Post-treatment operations | 0.00391899 | 0.9 | 1 -Other operations | 0.01633393 | 3.8 | -Number of virtual exchanges per time step: 10 -Maximum number of MPI allreduce per time step 14 - -Average number of iteration of the linear solver per call: 16.8 - - ---------------------------------------------------------------------------------------------------------- -Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. 
---------------------------------------------------------------------------------------------------------- - -Average of the fraction of the time spent in communications between processors: 3.3 % -Max of the fraction of the time spent in communications between processors: 4.8 % -Min of the fraction of the time spent in communications between processors: 2.7 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 8.05391e-06 -Network maximum bandwidth on all processors: 47.5 GB/s -Total network traffic: 3572.84 MB/time step -Average message size: 2296.83 kB -Min waiting time: 2.8 % of total time -Max waiting time: 4.5 % of total time -Avg waiting time: 3.66875 % of total time - ------------------------------------------------------------------------------------------------------------ - GPU statistics ------------------------------------------------------------------------------------------------------------ -Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ------------------------------------------------------------------------------------------------------------ -Libraries: | 0.261533 | 60.8 | 1 | -Kernels: | 0.144021 | 33.5 | 133 | -Copy host to device: | 0.000297935 | 0.1 | 5 | 9.1 GB/s -Copy device to host: | 0.000368028 | 0.1 | 4 | 14.3 GB/s -Alloc/Free on device: | 1.32222e-05 | 0.0 | 6 | -GPU: 94% Copy H<->D: 0.15% Alloc/free: 0.0031% Comm: 4.6% CPU & I/O: 1% -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Post-resolution statistics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 3.45915 -Maximum number of MPI allreduce per time step 6 - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Max waiting time big => probably due to a bad partitioning -Communications > 
30% => too many processors or network too slow -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Total time for the whole computation 61.9121 - -[Slurm] Power consumption (71 s): 1.265 kW 0.025 kWh 0.002 € (0.10€/kWh) +[Slurm] Power consumption (55 s): 0.558 kW 0.009 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx942x16 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx942x16 index 0342f9c835..ce3dcf89ba 100644 --- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx942x16 +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx942x16 @@ -8,7 +8,7 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 17-03-2026 -- 19:20:34 +Date: 08-06-2026 -- 14:52:55 OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD Instinct MI300A Accelerator Total number of threads:192 @@ -22,39 +22,39 @@ Total number of elements used for the calculation: 80864000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 42.2627 -Number of virtual exchanges: 61 -Maximum number of MPI allreduce per time step 113 +Total time of the start-up: 44.6328 +Number of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 112 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the 
linear problem per call: 5.57906 +Average time of the resolution of the linear problem per call: 5.07416 Average number of iteration of the linear solver per call: 23 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 2.96566 +Total time of the time loop: 1.65019 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.329517 -Standard deviation between time steps: 0.0188315 -Time elapsed in the skipped time steps: 0.458303 +Average time per time step: 0.183354 +Standard deviation between time steps: 0.00993221 +Time elapsed in the skipped time steps: 0.272063 -Percent of total time spend in communication: 4.53575 +Percent of total time spend in communication: 8.04868 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.273889 | 83.1 | 1 -Convection operator | 0.01395275 | 4.2 | 1 -Diffusion operator | 0.003969847 | 1.2 | 1 -Gradient operator | 0.004818868 | 1.5 | 2 -Divergence operator | 0.003827343 | 1.2 | 2 -Update ::mettre_a_jour | 0.008657538 | 2.6 | 1 -Computation of the time step dt | 0.00643696 | 2.0 | 2 -Post-treatment operations | 0.002952047 | 0.9 | 1 -Other operations | 0.01101313 | 3.3 | -Number of virtual exchanges per time step: 10 +Linear solver resolutions Ax=B | 0.133307 | 72.7 | 1 +Convection operator | 0.01025036 | 5.6 | 1 +Diffusion operator | 0.004059215 | 2.2 | 1 +Gradient operator | 0.004627898 | 2.5 | 2 +Divergence operator | 0.001907161 | 1.0 | 2 +Update ::mettre_a_jour | 0.007985502 | 4.4 | 1 +Computation of the time step dt | 0.006856734 | 3.7 | 2 
+Post-treatment operations | 0.002799652 | 1.5 | 1 +Other operations | 0.01156031 | 6.3 | +Number of virtual exchanges per time step: 9 Maximum number of MPI allreduce per time step 14 Average number of iteration of the linear solver per call: 16.8 @@ -64,32 +64,32 @@ Average number of iteration of the linear solver per call: 16.8 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. --------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 4 % -Max of the fraction of the time spent in communications between processors: 5.7 % -Min of the fraction of the time spent in communications between processors: 3.8 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 2.05594e-05 -Network maximum bandwidth on all processors: 43.6 GB/s +Average of the fraction of the time spent in communications between processors: 7 % +Max of the fraction of the time spent in communications between processors: 10.7 % +Min of the fraction of the time spent in communications between processors: 6.4 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 1.91406e-05 +Network maximum bandwidth on all processors: 42.1 GB/s Total network traffic: 3572.84 MB/time step Average message size: 2296.83 kB -Min waiting time: 3.7 % of total time -Max waiting time: 5.2 % of total time -Avg waiting time: 4.5625 % of total time +Min waiting time: 6.9 % of total time +Max waiting time: 9.2 % of total time +Avg waiting time: 7.975 % of total time ----------------------------------------------------------------------------------------------------------- GPU statistics ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
----------------------------------------------------------------------------------------------------------- -Libraries: | 0.273451 | 83.0 | 1 | -Kernels: | 0.0357669 | 10.9 | 133 | -Copy host to device: | 0.000190326 | 0.1 | 5 | 14.3 GB/s -Copy device to host: | 0.000208065 | 0.1 | 4 | 25.3 GB/s -Alloc/Free on device: | 1.49636e-05 | 0.0 | 6 | -GPU: 94% Copy H<->D: 0.12% Alloc/free: 0.0045% Comm: 5.2% CPU & I/O: 0.8% +Libraries: | 0.132898 | 72.5 | 1 | +Kernels: | 0.0301383 | 16.4 | 133 | +Copy host to device: | 0.000228833 | 0.1 | 5 | 11.9 GB/s +Copy device to host: | 0.000239867 | 0.1 | 4 | 22.0 GB/s +Alloc/Free on device: | 2.84444e-07 | 0.0 | 6 | +GPU: 89% Copy H<->D: 0.26% Alloc/free: 0.00016% Comm: 9.4% CPU & I/O: 1.4% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 1.95444 +Time of the post-resolution: 2.20561 Maximum number of MPI allreduce per time step 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,6 +97,6 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 47.6411 +Total time for the whole computation 48.7606 -[Slurm] Power consumption (57 s): 1.815 kW 0.029 kWh 0.003 € (0.10€/kWh) +[Slurm] Power consumption (57 s): 2.676 kW 0.042 kWh 0.004 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.jean-zay_cc90x8 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.jean-zay_cc90x8 new file mode 100644 index 0000000000..725817887b --- /dev/null +++ 
b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.jean-zay_cc90x8 @@ -0,0 +1,102 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_PETSc_10_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 10-06-2026 -- 15:37:09 +OS: jzxh250__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.20 +Nb procs used for the computation: 8 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 80864000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 50.1571 +Number of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 112 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 9.18945 +Average number of iteration of the linear solver per call: 28 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 3.84827 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.427586 +Standard deviation between time steps: 0.0255095 +Time elapsed in the skipped time steps: 0.699009 + +Percent of total time spend in communication: 1.14865 + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.368848 | 86.3 | 1 +Convection operator | 0.0146931 | 3.4 | 1 +Diffusion operator | 0.005882547 | 1.4 | 1 +Gradient operator | 0.00578996 | 1.4 | 2 +Divergence operator | 0.002865112 | 0.7 | 2 +Update ::mettre_a_jour | 0.009087068 | 2.1 | 1 +Computation of the time step dt | 0.003678033 | 0.9 | 2 +Post-treatment operations | 0.004289857 | 1.0 | 1 +Other operations | 0.01245212 | 2.9 | +Number of virtual exchanges per time step: 9 +Maximum number of MPI allreduce per time step 14 + +Average number of iteration of the linear solver per call: 20.9 + + +--------------------------------------------------------------------------------------------------------- +Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. 
+--------------------------------------------------------------------------------------------------------- + +Average of the fraction of the time spent in communications between processors: 1.6 % +Max of the fraction of the time spent in communications between processors: 2.7 % +Min of the fraction of the time spent in communications between processors: 1.3 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 5.57243e-06 +Network maximum bandwidth on all processors: 180.4 GB/s +Total network traffic: 2535.2 MB/time step +Average message size: 4541.56 kB +Min waiting time: 1.3 % of total time +Max waiting time: 2.5 % of total time +Avg waiting time: 1.9375 % of total time + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.368458 | 86.2 | 1 | +Kernels: | 0.0464343 | 10.9 | 133 | +Copy host to device: | 0.000551314 | 0.1 | 5 | 8.8 GB/s +Copy device to host: | 0.000781695 | 0.2 | 4 | 11.7 GB/s +Alloc/Free on device: | 3.24e-07 | 0.0 | 6 | +GPU: 97% Copy H<->D: 0.31% Alloc/free: 7.6e-05% Comm: 1.4% CPU & I/O: 1.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 2.33551 +Maximum number of MPI allreduce per time step 6 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Max waiting time big => probably due to a bad partitioning +Communications > 
30% => too many processors or network too slow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Total time for the whole computation 57.0399 + +[Slurm] Power consumption (88 s): 0.947 kW 0.023 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.lumi_gfx90ax16 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.lumi_gfx90ax16 index 654fbec68e..d073b666c4 100644 --- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.lumi_gfx90ax16 +++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.lumi_gfx90ax16 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 24-02-2026 -- 00:30:07 -OS: nid007973__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 15-05-2026 -- 21:00:20 +OS: nid005023__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,39 +22,39 @@ Total number of elements used for the calculation: 80864000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 62.7028 -Number of virtual exchanges: 61 -Maximum number of MPI allreduce per time step 113 +Total time of the start-up: 78.8995 +Number of virtual exchanges: 59 +Maximum number of MPI allreduce per time step 112 Number of calls to the 
linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 7.42008 +Average time of the resolution of the linear problem per call: 8.6603 Average number of iteration of the linear solver per call: 23 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 3.98958 +Total time of the time loop: 3.3746 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.443287 -Standard deviation between time steps: 0.0201239 -Time elapsed in the skipped time steps: 0.540091 +Average time per time step: 0.374955 +Standard deviation between time steps: 0.0173637 +Time elapsed in the skipped time steps: 0.473033 -Percent of total time spend in communication: 4.67846 +Percent of total time spend in communication: 4.13723 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.26956 | 53.6 | 1 -Convection operator | 0.04405853 | 8.8 | 1 -Diffusion operator | 0.01489942 | 3.0 | 1 -Gradient operator | 0.04936467 | 9.8 | 2 -Divergence operator | 0.01089552 | 2.2 | 2 -Update ::mettre_a_jour | 0.01366762 | 2.7 | 1 -Computation of the time step dt | 0.01753448 | 3.5 | 2 -Post-treatment operations | 0.00330442 | 0.7 | 1 -Other operations | 0.02000149 | 4.0 | -Number of virtual exchanges per time step: 10 +Linear solver resolutions Ax=B | 0.25189 | 67.2 | 1 +Convection operator | 0.03422895 | 9.1 | 1 +Diffusion operator | 0.01616347 | 4.3 | 1 +Gradient operator | 0.01467866 | 3.9 | 2 +Divergence operator | 0.01064046 | 2.8 | 2 +Update ::mettre_a_jour | 0.01338688 | 3.6 | 1 +Computation 
of the time step dt | 0.01679689 | 4.5 | 2 +Post-treatment operations | 0.00328297 | 0.9 | 1 +Other operations | 0.01388677 | 3.7 | +Number of virtual exchanges per time step: 9 Maximum number of MPI allreduce per time step 14 Average number of iteration of the linear solver per call: 16.8 @@ -64,32 +64,32 @@ Average number of iteration of the linear solver per call: 16.8 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated. --------------------------------------------------------------------------------------------------------- -Average of the fraction of the time spent in communications between processors: 3.6 % -Max of the fraction of the time spent in communications between processors: 5.7 % -Min of the fraction of the time spent in communications between processors: 2.3 % -Time of one mpsum measured by an internal bench over 0.1s (network latency): 8.50623e-06 -Network maximum bandwidth on all processors: 54.1 GB/s +Average of the fraction of the time spent in communications between processors: 3.9 % +Max of the fraction of the time spent in communications between processors: 6.2 % +Min of the fraction of the time spent in communications between processors: 3.4 % +Time of one mpsum measured by an internal bench over 0.1s (network latency): 8.64124e-06 +Network maximum bandwidth on all processors: 42.6 GB/s Total network traffic: 3572.84 MB/time step Average message size: 2296.83 kB -Min waiting time: 2.5 % of total time -Max waiting time: 5.3 % of total time -Avg waiting time: 4.06875 % of total time +Min waiting time: 3.4 % of total time +Max waiting time: 5.8 % of total time +Avg waiting time: 4.3875 % of total time ----------------------------------------------------------------------------------------------------------- GPU statistics ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step 
| Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.269051 | 60.7 | 1 | -Kernels: | 0.146349 | 33.0 | 133 | -Copy host to device: | 0.000343226 | 0.1 | 5 | 7.9 GB/s -Copy device to host: | 0.000442977 | 0.1 | 4 | 11.9 GB/s -Alloc/Free on device: | 1.62248e-05 | 0.0 | 6 | -GPU: 94% Copy H<->D: 0.18% Alloc/free: 0.0037% Comm: 5.3% CPU & I/O: 0.8% +Libraries: | 0.25138 | 67.0 | 1 | +Kernels: | 0.101628 | 27.1 | 133 | +Copy host to device: | 0.000340213 | 0.1 | 5 | 8.0 GB/s +Copy device to host: | 0.000432693 | 0.1 | 4 | 12.2 GB/s +Alloc/Free on device: | 5.12e-07 | 0.0 | 6 | +GPU: 94% Copy H<->D: 0.21% Alloc/free: 0.00014% Comm: 4.7% CPU & I/O: 0.93% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 3.27229 +Time of the post-resolution: 3.00196 Maximum number of MPI allreduce per time step 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,6 +97,6 @@ Max waiting time big => probably due to a bad partitioning Communications > 30% => too many processors or network too slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time for the whole computation 70.5048 +Total time for the whole computation 85.7491 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (114 s): 1.176 kW 0.037 kWh 0.004 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_Iterateur/check_perf.sh b/tests/GPU/OpenMP_Iterateur/check_perf.sh index b9c4f392cd..ecd6c19405 100755 --- a/tests/GPU/OpenMP_Iterateur/check_perf.sh +++ b/tests/GPU/OpenMP_Iterateur/check_perf.sh @@ -15,7 +15,7 @@ check() then mv -f $TU $TU_REF 
&& [ "$TRUST_SCM" = 1 ] && git add $TU_REF echo "Creating new reference $TU_REF" - exit + exit 0 fi ref=`TU.sh $TU_REF -dt` new=`TU.sh $TU -dt` @@ -68,6 +68,9 @@ else [ $HOST = is157091 ] && run $HOST$GPU_ARCH 2 OpenMP_Iterateur_BENCH_PETSc [ "`hostname`" = petra ] && run $HOST$GPU_ARCH 2 [ $HOST = topaze ] && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10 + [ $HOST = dalianvl ] && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10 + [ $HOST = jean-zay ] && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_PETSc_10 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10 + [ $HOST = dalia ] && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10 [ $HOST = adastra ] && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 16 OpenMP_Iterateur_BENCH_PETSc_10 [ $HOST = lumi ] && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 16 OpenMP_Iterateur_BENCH_PETSc_10 [ $HOST = irene-amd-ccrt ] && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10 diff --git a/tests/GPU/OpenMP_Iterateur/plot_scaling.py b/tests/GPU/OpenMP_Iterateur/plot_scaling.py new file mode 100644 index 0000000000..e156b3bb73 --- /dev/null +++ b/tests/GPU/OpenMP_Iterateur/plot_scaling.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +""" +plot_scaling.py +--------------- +Plot Performance [MDOF/s] vs Problem Size [MDOF] from *SCALING.* files. + +Usage: + python3 plot_scaling.py # auto-detect *SCALING.* in current dir + python3 plot_scaling.py file1 file2 ... 
# explicit files + python3 plot_scaling.py -o output.pdf file1 # custom output name + python3 plot_scaling.py -all # also plot Solver/Conv/Diff/Grad/Div + python3 plot_scaling.py -normalize # normalize each curve to [0, 1] +""" + +import sys +import os +import glob +import argparse +from collections import defaultdict + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import numpy as np + +# ── Matplotlib / rcParams for paper-quality output ────────────────────────── +plt.rcParams.update({ + "text.usetex": False, # set True if LaTeX is available + "font.family": "serif", + "font.serif": ["DejaVu Serif", "Times New Roman", "Times"], + "font.size": 11, + "axes.titlesize": 13, + "axes.labelsize": 12, + "xtick.labelsize": 10, + "ytick.labelsize": 10, + "legend.fontsize": 9, + "legend.framealpha": 0.85, + "lines.linewidth": 1.5, + "lines.markersize": 6, + "figure.dpi": 150, + "savefig.dpi": 300, + "savefig.bbox": "tight", + "savefig.pad_inches": 0.05, + "axes.grid": True, + "grid.linestyle": "--", + "grid.linewidth": 0.5, + "grid.alpha": 0.6, + "axes.spines.top": False, + "axes.spines.right": False, +}) + +# ── Colour / marker cycle (colour-blind-friendly) ──────────────────────────── +COLORS = [ + "#0072B2", # blue + "#D55E00", # vermilion + "#009E73", # green + "#CC79A7", # pink + "#E69F00", # orange + "#56B4E9", # sky blue + "#F0E442", # yellow +] +MARKERS = ["o", "s", "^", "D", "v", "P", "X"] + +# Metrics available in -all mode, with their column index and display name +ALL_METRICS = [ + ("MDOF/s", 10), + ("Solver", 11), + ("Conv", 13), + ("Diff", 14), + ("Grad", 15), + ("Div", 16), +] +# Line styles to distinguish metrics when multiple configs are present +LINESTYLES = ["-", "--", "-.", ":", (0, (3, 1, 1, 1)), (0, (5, 2))] + + +# ── File parsing ───────────────────────────────────────────────────────────── + +def parse_scaling_file(path: str, all_metrics: bool = False): + """ + Return parsed data from a 
SCALING file. + + Default mode → {config: ([MDOF], [MDOF/s])} + all_metrics → {config: ([MDOF], {metric_name: [values]})} + + Header layout (0-based column indices): + 0=Config 1=[MTET] 2=[MDOF] 3=TimeStep[s] 4=Solver[s] 5=[its] + 6=[ms/it] 7=Kernels[s] 8=RAM[GB] 9=DRAM[GB] 10=[MDOF/s] + 11=Solver 12=Kernels 13=Conv 14=Diff 15=Grad 16=Div + """ + raw_data: dict = defaultdict(lambda: defaultdict(list)) + + with open(path) as fh: + for lineno, line in enumerate(fh, 1): + line = line.strip() + if not line or line.startswith("#"): + continue + parts = line.split() + if parts[0] == "Config": + continue + min_cols = max(col for _, col in ALL_METRICS) + 1 if all_metrics else 11 + if len(parts) < min_cols: + if len(parts) >= 11: + # file has basic columns only; fall back to default parsing + pass + else: + print(f" [skip] {os.path.basename(path)}:{lineno} – only {len(parts)} columns") + continue + try: + config = parts[0] + mdof = float(parts[2]) + except (ValueError, IndexError) as exc: + print(f" [skip] {os.path.basename(path)}:{lineno} – {exc}") + continue + + raw_data[config]["mdof"].append(mdof) + + if all_metrics: + for name, col in ALL_METRICS: + try: + raw_data[config][name].append(float(parts[col])) + except (ValueError, IndexError): + raw_data[config][name].append(float("nan")) + else: + try: + raw_data[config]["MDOF/s"].append(float(parts[10])) + except (ValueError, IndexError) as exc: + print(f" [skip] {os.path.basename(path)}:{lineno} – {exc}") + raw_data[config]["mdof"].pop() + + # Convert to numpy arrays + result = {} + for config, arrays in raw_data.items(): + mdof = np.array(arrays["mdof"]) + if all_metrics: + metrics = {name: np.array(arrays[name]) for name, _ in ALL_METRICS} + result[config] = (mdof, metrics) + else: + result[config] = (mdof, np.array(arrays["MDOF/s"])) + return result + + +# ── Plotting ───────────────────────────────────────────────────────────────── + +def sort_configs(configs: list) -> list: + """Try to sort configs numerically by 
the leading MPI count, then GPU count.""" + def key(s): + try: + mpi_part = s.split("MPI")[0] + gpu_part = s.split("+")[1].replace("GPU", "") if "+" in s else "0" + return (int(mpi_part), int(gpu_part)) + except Exception: + return (0, 0) + return sorted(configs, key=key) + + +def _normalize(arr: np.ndarray) -> np.ndarray: + """Normalize array to [0, 1] by its maximum (ignoring NaN).""" + mx = np.nanmax(arr) + if mx == 0 or np.isnan(mx): + return arr + return arr / mx + + +def plot_scaling(files: list, output: str, all_mode: bool = False, normalize: bool = False, log: bool = False): + """Read all files and produce one figure per file (saved as PDF + PNG).""" + + for fpath in files: + print(f"Processing: {fpath}") + data = parse_scaling_file(fpath, all_metrics=all_mode) + + if not data: + print(f" [warn] no valid data found, skipping.") + continue + + fig, ax = plt.subplots(figsize=(6.5, 4.5)) + configs = sort_configs(list(data.keys())) + + if all_mode: + # ── -all: one curve per (config, metric) ──────────────────────── + # Colors cycle over metrics, line styles over configs + metric_names = [name for name, _ in ALL_METRICS] + metric_color = {name: COLORS[i % len(COLORS)] for i, name in enumerate(metric_names)} + config_ls = {cfg: LINESTYLES[i % len(LINESTYLES)] for i, cfg in enumerate(configs)} + + for config in configs: + mdof, metrics = data[config] + order = np.argsort(mdof) + mdof = mdof[order] + ls = config_ls[config] + + for name in metric_names: + vals = metrics[name][order] + if normalize: + vals = _normalize(vals) + label = f"{config} – {name}" + ax.plot( + mdof, vals, + color=metric_color[name], + linestyle=ls, + marker=MARKERS[metric_names.index(name) % len(MARKERS)], + label=label, + zorder=3, + ) + + # Legend: two sections – one for metrics (color), one for configs (linestyle) + from matplotlib.lines import Line2D + legend_handles = [] + for name in metric_names: + legend_handles.append(Line2D([0], [0], color=metric_color[name], + linewidth=2, 
label=name)) + if len(configs) > 1: + legend_handles.append(Line2D([0], [0], color="none", label="")) + for cfg in configs: + legend_handles.append(Line2D([0], [0], color="grey", + linestyle=config_ls[cfg], label=cfg)) + ax.legend(handles=legend_handles, fontsize=8, loc="best", handlelength=2.5) + + else: + # ── default: one curve per config, [MDOF/s] only ──────────────── + for idx, config in enumerate(configs): + mdof, perf = data[config] + order = np.argsort(mdof) + mdof, perf = mdof[order], perf[order] + if normalize: + perf = _normalize(perf) + + ax.plot( + mdof, perf, + color=COLORS[idx % len(COLORS)], + marker=MARKERS[idx % len(MARKERS)], + label=config, + zorder=3, + ) + + ncol = max(1, len(configs) // 6 + 1) + ax.legend(title="Configuration", ncol=ncol, loc="best", handlelength=2.0) + + # ── axes labels & title ────────────────────────────────────────────── + ax.set_xlabel("Problem size [MDOF]") + if normalize: + ax.set_ylabel("Normalized performance [0–1]") + ax.set_ylim(0, 1.05) + else: + ax.set_ylabel("Performance [MDOF/s]") + ax.set_ylim(bottom=0) + + base = os.path.splitext(os.path.basename(fpath))[0] + parts = base.split("_SCALING") + if len(parts) == 2: + case = parts[0].replace("_", " ") + suffix = parts[1].lstrip(".") + title = f"{case} – Performance Scaling" + if suffix: + title += f"\n({suffix})" + else: + title = base.replace("_", " ") + ax.set_title(title) + + if log: + ax.set_xscale("log") + ax.xaxis.set_major_formatter(ticker.LogFormatterSciNotation(labelOnlyBase=False)) + else: + ax.set_xlim(left=0) + ax.xaxis.set_minor_locator(ticker.AutoMinorLocator()) + ax.yaxis.set_minor_locator(ticker.AutoMinorLocator()) + ax.tick_params(which="minor", length=3) + + # ── save ───────────────────────────────────────────────────────────── + if output: + stem = os.path.splitext(output)[0] + ext = os.path.splitext(output)[1] or ".pdf" + idx_str = f"_{files.index(fpath):02d}" if len(files) > 1 else "" + out_path = stem + idx_str + ext + else: + dir_ = 
os.path.dirname(os.path.abspath(fpath)) + stem = os.path.splitext(os.path.basename(fpath))[0] + out_path = os.path.join(dir_, stem + ".pdf") + + fig.savefig(out_path) + print(f" -> saved: {out_path}") + + png_path = os.path.splitext(out_path)[0] + ".png" + fig.savefig(png_path) + print(f" -> saved: {png_path}") + + plt.close(fig) + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser( + description="Plot Performance [MDOF/s] vs Problem Size [MDOF] from SCALING files." + ) + parser.add_argument( + "files", + nargs="*", + help="SCALING file(s). If omitted, auto-detects *SCALING.* in the current directory.", + ) + parser.add_argument( + "-o", "--output", + default="", + help="Output file path (e.g. plot.pdf). Extension determines format.", + ) + parser.add_argument( + "-all", + action="store_true", + dest="all_mode", + help="Plot all kernel metrics (MDOF/s, Solver, Conv, Diff, Grad, Div) in the same figure.", + ) + parser.add_argument( + "-normalize", + action="store_true", + help="Normalize each curve by its maximum so values range from 0 to 1.", + ) + parser.add_argument( + "-log", + action="store_true", + help="Use a logarithmic scale on the X axis.", + ) + args = parser.parse_args() + + files = args.files + if not files: + files = sorted(glob.glob("*SCALING.*")) + _skip_exts = {".swp", ".pdf", ".png", ".eps", ".svg", + ".jpg", ".jpeg", ".pyc", "~"} + files = [f for f in files + if os.path.splitext(f)[1].lower() not in _skip_exts + and not f.endswith("~") + and os.path.isfile(f)] + if not files: + print("No SCALING files found in the current directory.") + print("Usage: python3 plot_scaling.py [file1 file2 ...] 
[-o output.pdf]") + sys.exit(1) + print(f"Auto-detected files: {files}") + + plot_scaling(files, args.output, all_mode=args.all_mode, normalize=args.normalize, log=args.log) + + +if __name__ == "__main__": + main() diff --git a/tests/GPU/OpenMP_Iterateur/scaling.sh b/tests/GPU/OpenMP_Iterateur/scaling.sh index 800239ec47..c33d03160a 100755 --- a/tests/GPU/OpenMP_Iterateur/scaling.sh +++ b/tests/GPU/OpenMP_Iterateur/scaling.sh @@ -1,8 +1,8 @@ -#!/bin/bash +#!/bin/bash # Scaling a mesh on several GPU [ "$TRUST_ROOT" = "" ] && echo "TRUST_ROOT empty." && exit - -# HOST: +scale=1.15 && [ "$1" != "" ] && scale=$1 +# HOST: HOST=${HOST%.intra.cea.fr} && [ "$HOST" = portable ] && HOST=is246827 # ARCH: GPU_ARCH="" @@ -28,12 +28,12 @@ do gpus="1 2" else gpus="1" - fi + fi # Target problem sizes in MDOF (millions of degrees of freedom). # MDOF formula: 1.2*40*(Nx-1)*(Ny-1)*(Nz-1)/1e6 # Nx, Ny, Nz are scaled uniformly from the base mesh in the .data file. - + # Read base Nombre_de_Noeuds from reference data file. # Prefer /* Nombre_de_Noeuds X Y Z */ commented template (scaling hint); # fall back to the last active Nombre_de_Noeuds line. 
@@ -44,40 +44,26 @@ do else ref_data=$ROOT/`basename $ROOT`.data fi - read Nx0 Ny0 Nz0 <<< $(awk ' - /\/\* Nombre_de_Noeuds [0-9]/ { cx=$3; cy=$4; cz=$5 } - /^[[:space:]]*Nombre_de_Noeuds [0-9]/ { ax=$2; ay=$3; az=$4 } - END { if (cx!="") print cx,cy,cz; else print ax,ay,az } - ' $ref_data) - [ -z "$Nx0" ] && echo "Error: Nombre_de_Noeuds not found in $ref_data" && exit 1 - echo "# Base mesh from $ref_data: Nombre_de_Noeuds $Nx0 $Ny0 $Nz0" - + Lx=`awk '/Longueurs/ {print $2}' $ref_data` + Ly=`awk '/Longueurs/ {print $3}' $ref_data` + Lz=`awk '/Longueurs/ {print $4}' $ref_data` + alpha=1 for gpu in $gpus do mpis=$TRUST_NB_PHYSICAL_CORES && [ $version = gpu ] && mpis=$gpu - mdof_target=0.1 - inc_mdof=0.1 for mpi in $mpis do while [ 1 ] do - # Compute new Nx Ny Nz matching the target MDOF via uniform scaling: - # alpha = cbrt( target_MDOF*1e6 / (1.2*40*(Nx0-1)*(Ny0-1)*(Nz0-1)) ) - # Ni = 1 + round( alpha * (Ni0-1) ) - read Nx Ny Nz <<< $(awk -v Nx0=$Nx0 -v Ny0=$Ny0 -v Nz0=$Nz0 -v target=$mdof_target \ - 'BEGIN { - base = 1.2 * 40 * (Nx0-1) * (Ny0-1) * (Nz0-1) - alpha = (target * 1e6 / base) ^ (1.0/3.0) - Nx = int(0.5 + 1 + alpha * (Nx0-1)); if (Nx < 2) Nx = 2 - Ny = int(0.5 + 1 + alpha * (Ny0-1)); if (Ny < 2) Ny = 2 - Nz = int(0.5 + 1 + alpha * (Nz0-1)); if (Nz < 2) Nz = 2 - print Nx, Ny, Nz - }') + Nx=`echo $Lx $alpha | awk '{print int($1*$2)}'` + Ny=`echo $Ly $alpha | awk '{print int($1*$2)}'` + Nz=`echo $Lz $alpha | awk '{print int($1*$2)}'` + alpha=`echo $alpha*$scale | bc -l` jdd=$mpi"_"$gpu"_"${Nx}x${Ny}x${Nz} mkdir -p $ROOT/scaling/$jdd && cd $ROOT/scaling/$jdd # Run ? run=1 && [ -f $jdd.out_err ] && run=0 - [ $run = 1 ] && echo "$jdd (target ${mdof_target} MDOF) ..." + [ $run = 1 ] && echo "$jdd ..." 
# Creation data if [ -f $ROOT/OpenMP_Iterateur.data ] then @@ -100,15 +86,16 @@ do # Decoupage [ $run = 1 ] && [ $mpi != 1 ] && (make_PAR.data $jdd $mpi 1>/dev/null 2>&1;cp PAR_$jdd.data $jdd.data) # Calcul - [ $run = 1 ] && (trust $jdd $mpi -ksp_view -journal=0 1>$jdd.out_err 2>&1 || (rm -f *.TU;echo "Error:See "`pwd`/$jdd.out_err)) - [ "`grep 'Arret des process' $jdd.out_err`" = "" ] && break + #[ $run = 1 ] && (trust $jdd $mpi -ksp_view -journal=0 1>$jdd.out_err 2>&1 || (rm -f *.TU;echo "Error:See "`pwd`/$jdd.out_err)) + [ $run = 1 ] && (trust $jdd $mpi -journal=0 1>$jdd.out_err 2>&1 || (rm -f *.TU;echo "Error:See "`pwd`/$jdd.out_err)) + [ "`grep 'Arret des process' $jdd.out_err`" = "" ] && alpha=`echo $alpha*0.5 | bc -l` && break # Analyse - i=0 && [ "$HOST" = adastra ] && i=1 - hram=`awk -v i=$i '/RAM taken/ {if ($(13+i)>RAM) RAM=$(13+i)} END {print 0.1*int(0.01*RAM)}' $jdd.out_err` - dram=`awk -v i=$i '/RAM allocated on a GPU/ {if ($(1+i)>RAM) RAM=$(1+i)} END {print RAM}' $jdd.out_err` - row=`awk '/Order of the PETSc matrix/ {print $7;exit}' $jdd.out_err` - faces=`awk '/Total number of faces/ {print $NF;exit}' $jdd.out_err` - elems=`awk '/Total number of elements/ {printf($NF);exit}' $jdd.out_err` + i=0 && [ "$HOST" = adastra ] && i=1 + hram=`awk -v i=$i '/RAM taken/ {if ($(13+i)>RAM) RAM=$(13+i)} END {print 0.1*int(0.01*RAM)}' $jdd.out_err` + dram=`awk -v i=$i '/RAM allocated on a GPU/ {if ($(1+i)>RAM) RAM=$(1+i)} END {print RAM}' $jdd.out_err` + row=`awk -v i=$i '/Order of the PETSc matrix/ {print $(7+i);exit}' $jdd.out_err` + faces=`awk -v i=$i '/Total number of faces/ {print $NF;exit}' $jdd.out_err` + elems=`awk -v i=$i '/Total number of elements/ {printf($NF);exit}' $jdd.out_err` # No better to use dof=row dof=$row #dof=`echo 1*$faces | bc -l` # En VDF @@ -117,21 +104,21 @@ do its=`TU.sh $jdd.TU -its` awk -v mpi=$mpi -v gpu=$gpu -v elems=$elems -v row=$row -v dof=$dof -v hram=$hram -v dram=$dram -v dt=$dt -v its=$its '\ BEGIN 
{config=mpi"MPI"(gpu==0?"":"+"gpu"GPU");mdof=dof/1e6;mtet=elems/1e6} \ - /Linear solver/ {ts=$6;b=dt-ts;ls=mdof/ts} \ + /Linear solver/ {ts=$6;b=dt-ts;ls=mdof/ts*its} \ /Convection operator/ { conv=mdof/$4*$8 } \ /Diffusion operator/ { diff=mdof/$4*$8 } \ /Gradient operator/ { grad=mdof/$4*$8 } \ /Divergence operator/ { dive=mdof/$4*$8 } \ - /Kernels:/ { ks=mdof/$3 } \ - END {printf("%s %7.3f %7.3f %11.3f %9.3f %5d %7.1f %10.3f %7.1f %8.1f %8.1f %6.1f %6.1f %4d %4d %4d %4d\n", \ + /Kernels:/ { ks=int(mdof/$3) } \ + END {printf("%s %7.3f %7.3f %11.3f %9.3f %5d %7.1f %10.3f %7.1f %8.1f %8.1f %4d %7d %4d %4d %4d %4d\n", \ config, mtet, mdof, dt, ts, its, 1000*ts/its, b, hram, dram, mdof/dt, ls, ks, conv, diff, grad, dive)}' $jdd.TU | tee -a $log # Clean - rm -f *.sauv *.xyz *.*lata* *.cgns* *.face *.son *.lml - mdof_target=`echo $mdof_target+$inc_mdof | bc -l` + rm -f *.sauv *.xyz *.*lata* *.cgns* *.face *.son *.lml done done done done echo "$log created." -python3 ./plot_scaling.py -display JEL_bous_SCALING.png +cd $ROOT +python3 ../OpenMP_Iterateur/plot_scaling.py +echo "display JEL_bous_SCALING.png" diff --git a/tests/GPU/OpenMP_Iterateur/weak_scaling.sh b/tests/GPU/OpenMP_Iterateur/weak_scaling.sh index 9169fa5b59..36b72d85fd 100755 --- a/tests/GPU/OpenMP_Iterateur/weak_scaling.sh +++ b/tests/GPU/OpenMP_Iterateur/weak_scaling.sh @@ -80,11 +80,13 @@ do load_imbalance=`awk '/Load imbalance/ {print $NF}' $jdd.out_err | tail -1` dof=`awk '/Total number of elements/ {print $NF}' $jdd.out_err | tail -1` dram=`awk '/allocated on a GPU/ {print $1}' $jdd.out_err | tail -1` - its=`awk '/Iterations/ && /solveur/ {print $NF}' $jdd.TU` + its=`TU.sh $jdd.TU -its` + dt=`TU.sh $jdd.TU -dt` + s=`TU.sh $jdd.TU -solver` gpu="\t" && [ $bench = gpu ] && gpu="+"$mpi"GPU" direct="Off" && [ "`grep 'Enabling GPU' $jdd.out_err`" != "" ] && direct="On" kj=`grep -l $jdd myjob.* 2>/dev/null | tail -1 | awk -F. 
'{print $2}' | xargs -I {} sacct --format=JobID,ElapsedRaw,ConsumedEnergyRaw,NodeList --jobs={} 2>/dev/null | awk '/\.batch/ {print $3}'` - awk -v host=$HOST -v mpi=$mpi"MPI" -v gpu=$gpu -v dof=$dof -v lib=$load_imbalance -v its=$its -v direct=$direct -v kj=$kj -v dram=$dram '/Secondes/ && /pas de temps/ {dt=$NF} /Dont solveurs/ {s=$4;b=dt-s} END {print host" \t"dof" \t"mpi""gpu"\t"dt" \t"s" \t"b" \t"int(dof/dt*0.001*0.001)" \t"int(its)" \t"lib" \t"kj" \t\t"direct" \t\t"dram" \t\t"1000*s/its}' $jdd.TU + awk -v host=$HOST -v mpi=$mpi"MPI" -v gpu=$gpu -v dof=$dof -v lib=$load_imbalance -v its=$its -v direct=$direct -v kj=$kj -v dram=$dram -v dt=$dt -v s=$s 'END {print host" \t"dof" \t"mpi""gpu"\t"dt" \t"s" \t"dt-s" \t"int(dof/dt*0.001*0.001)" \t"int(its)" \t"lib" \t"kj" \t\t"direct" \t\t"dram" \t\t"1000*s/its}' $jdd.TU rm -f *.xyz *.sauv *.Zones # Clean cd - 1>/dev/null 2>&1 done diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC.data b/tests/GPU/OpenMP_QC/OpenMP_QC.data index 5bf205259d..336b8fd9a8 100644 --- a/tests/GPU/OpenMP_QC/OpenMP_QC.data +++ b/tests/GPU/OpenMP_QC/OpenMP_QC.data @@ -25,7 +25,7 @@ Scatter DOM.Zones dom END SCATTER # VEFPreP1B dis -lire dis { P0 P1 changement_de_base_P1bulle 1 CL_pression_sommet_faible 0 modif_div_face_dirichlet 0 } +Lire dis { reorder { algo Hilbert } } Runge_Kutta_rationnel_ordre_2 sch lire sch diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.adastra_gfx90a b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.adastra_gfx90a index cdd75e4ae9..ca96c4e05a 100644 --- a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.adastra_gfx90a +++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.adastra_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 18:15:49 -OS: 
g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 09:34:29 +OS: g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,40 +22,40 @@ Total number of elements used for the calculation: 393216 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 12.5807 +Total time of the start-up: 13.6576 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.353507 +Average time of the resolution of the linear problem per call: 0.43557 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.282159 +Total time of the time loop: 0.274393 Number of time steps: 2 Skipped time steps: 1 -Average time per time step: 0.14108 -Standard deviation between time steps: 0.00447892 -Time elapsed in the skipped time steps: 0.299704 +Average time per time step: 0.137196 +Standard deviation between time steps: 0.00374847 +Time elapsed in the skipped time steps: 0.303199 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0413907 | 14.2 | 2 -Matrix 
assembly for implicit scheme | 0.008847495 | 3.0 | 4 -Convection operator | 0.009065098 | 3.1 | 6 -Diffusion operator | 0.0179382 | 6.2 | 18 -Gradient operator | 0.01716352 | 5.9 | 5 -Divergence operator | 0.004838472 | 1.7 | 6 -Source terms | 0.0005823695 | 0.2 | 4 -Update ::mettre_a_jour | 0.01238165 | 4.3 | 4 -Solver for implicit diffusion | 0.008807739 | 3.0 | 4 -Computation of the time step dt | 0.003402077 | 1.2 | 6 -Post-treatment operations | 0.009174429 | 3.2 | 1 -Other operations | 0.007487872 | 2.6 | +Linear solver resolutions Ax=B | 0.0474501 | 34.6 | 2 +Matrix assembly for implicit scheme | 0.008291998 | 6.0 | 4 +Convection operator | 0.008458615 | 6.2 | 6 +Diffusion operator | 0.01870515 | 13.6 | 18 +Gradient operator | 0.01000784 | 7.3 | 5 +Divergence operator | 0.004593083 | 3.3 | 6 +Source terms | 0.0005224655 | 0.4 | 4 +Update ::mettre_a_jour | 0.009446594 | 6.9 | 4 +Solver for implicit diffusion | 0.008450158 | 6.2 | 4 +Computation of the time step dt | 0.003498005 | 2.5 | 6 +Post-treatment operations | 0.008986579 | 6.6 | 1 +Other operations | 0.008785769 | 6.4 | -Average number of iteration of the linear solver per call: 48 +Average number of iteration of the linear solver per call: 56.2 ----------------------------------------------------------------------------------------------------------- @@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call: 48 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0411813 | 29.2 | 2 | -Kernels: | 0.0890872 | 63.1 | 963 | -Copy host to device: | 0.000447347 | 0.3 | 21 | 2.2 GB/s -Copy device to host: | 0.000568236 | 0.4 | 9 | 12.0 GB/s -Alloc/Free on device: | 6.8284e-05 | 0.0 | 9 | -GPU: 92% Copy H<->D: 0.72% Alloc/free: 0.048% 
Comm: 0% CPU & I/O: 6.9% +Libraries: | 0.0472339 | 34.4 | 2 | +Kernels: | 0.0789504 | 57.5 | 963 | +Copy host to device: | 0.000471898 | 0.3 | 21 | 2.1 GB/s +Copy device to host: | 0.000577923 | 0.4 | 9 | 11.8 GB/s +Alloc/Free on device: | 5.6098e-05 | 0.0 | 9 | +GPU: 92% Copy H<->D: 0.77% Alloc/free: 0.041% Comm: 0% CPU & I/O: 7.2% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.00244098 +Time of the post-resolution: 0.00252862 -Total time for the whole computation 13.165 +Total time for the whole computation 14.2377 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (22 s): 0.360 kW 0.002 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.dalianvl_cc100 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..fd74cc1aad --- /dev/null +++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the OpenMP_QC_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:17:46 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 393216 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 7.27273 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.645492 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.148116 +Number of time steps: 2 +Skipped time steps: 1 +Average time per time step: 0.0740581 +Standard deviation between time steps: 0.00232288 +Time elapsed in the skipped time steps: 0.181552 + + +Standard counter description | 
Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0413845 | 55.9 | 2 +Matrix assembly for implicit scheme | 0.001681213 | 2.3 | 4 +Convection operator | 0.00283675 | 3.8 | 6 +Diffusion operator | 0.004026267 | 5.4 | 18 +Gradient operator | 0.001471085 | 2.0 | 5 +Divergence operator | 0.001090974 | 1.5 | 6 +Source terms | 0.0003684635 | 0.5 | 4 +Update ::mettre_a_jour | 0.004602123 | 6.2 | 4 +Solver for implicit diffusion | 0.004396186 | 5.9 | 4 +Computation of the time step dt | 0.000828607 | 1.1 | 6 +Post-treatment operations | 0.005638793 | 7.6 | 1 +Other operations | 0.00573316 | 7.7 | + +Average number of iteration of the linear solver per call: 54 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0411895 | 55.6 | 2 | +Kernels: | 0.0225578 | 30.5 | 963 | +Copy host to device: | 0.000361967 | 0.5 | 21 | 2.7 GB/s +Copy device to host: | 0.000221856 | 0.3 | 9 | 30.8 GB/s +Alloc/Free on device: | 0.000213488 | 0.3 | 9 | +GPU: 86% Copy H<->D: 0.79% Alloc/free: 0.29% Comm: 0% CPU & I/O: 13% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.00191222 + +Total time for the whole computation 7.60432 + diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.eureka_cc89 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.eureka_cc89 new file 
mode 100644 index 0000000000..fbbaf34459 --- /dev/null +++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.eureka_cc89 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the OpenMP_QC_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:35:02 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 393216 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 9.42622 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.357419 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time 
loop statistics +Total time of the time loop: 0.175936 +Number of time steps: 2 +Skipped time steps: 1 +Average time per time step: 0.087968 +Standard deviation between time steps: 0.00155373 +Time elapsed in the skipped time steps: 0.315067 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0534873 | 60.8 | 2 +Matrix assembly for implicit scheme | 0.002269501 | 2.6 | 4 +Convection operator | 0.004729102 | 5.4 | 6 +Diffusion operator | 0.005568396 | 6.3 | 18 +Gradient operator | 0.002533424 | 2.9 | 5 +Divergence operator | 0.001298084 | 1.5 | 6 +Source terms | 0.000266546 | 0.3 | 4 +Update ::mettre_a_jour | 0.003159048 | 3.6 | 4 +Solver for implicit diffusion | 0.004780001 | 5.4 | 4 +Computation of the time step dt | 0.001324467 | 1.5 | 6 +Post-treatment operations | 0.003367655 | 3.8 | 1 +Other operations | 0.005184457 | 5.9 | + +Average number of iteration of the linear solver per call: 54 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0533364 | 60.6 | 2 | +Kernels: | 0.02909 | 33.1 | 963 | +Copy host to device: | 0.000282456 | 0.3 | 21 | 3.5 GB/s +Copy device to host: | 0.00072923 | 0.8 | 9 | 9.4 GB/s +Alloc/Free on device: | 0.000103919 | 0.1 | 9 | +GPU: 94% Copy H<->D: 1.2% Alloc/free: 0.12% Comm: 0% CPU & I/O: 5% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.000208091 + +Total time for the whole computation 9.91744 + diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is157091_cc86 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is157091_cc86 index 81040101ab..f92f5361a6 100644 --- a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is157091_cc86 +++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is157091_cc86 @@ -8,7 +8,7 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 11-03-2026 -- 18:36:08 +Date: 22-04-2026 -- 07:55:46 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 393216 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 6.16451 +Total time of the start-up: 6.07675 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.279578 +Average time of the resolution of the linear problem per call: 0.338521 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not 
accounted for the computation of the time loop statistics -Total time of the time loop: 0.289052 +Total time of the time loop: 0.284861 Number of time steps: 2 Skipped time steps: 1 -Average time per time step: 0.144526 -Standard deviation between time steps: 0.000854199 -Time elapsed in the skipped time steps: 0.316179 +Average time per time step: 0.14243 +Standard deviation between time steps: 0.00177781 +Time elapsed in the skipped time steps: 0.320267 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0781792 | 54.1 | 2 -Matrix assembly for implicit scheme | 0.005994391 | 4.1 | 4 -Convection operator | 0.009186321 | 6.4 | 6 -Diffusion operator | 0.009221059 | 6.4 | 18 -Gradient operator | 0.004951293 | 3.4 | 5 -Divergence operator | 0.004072616 | 2.8 | 6 -Source terms | 0.000345246 | 0.2 | 4 -Update ::mettre_a_jour | 0.005491032 | 3.8 | 4 -Solver for implicit diffusion | 0.009880821 | 6.8 | 4 -Computation of the time step dt | 0.003923569 | 2.7 | 6 -Post-treatment operations | 0.0045862 | 3.2 | 1 -Other operations | 0.008694332 | 6.0 | +Linear solver resolutions Ax=B | 0.0777574 | 54.6 | 2 +Matrix assembly for implicit scheme | 0.00667315 | 4.7 | 4 +Convection operator | 0.008953442 | 6.3 | 6 +Diffusion operator | 0.009070647 | 6.4 | 18 +Gradient operator | 0.003474078 | 2.4 | 5 +Divergence operator | 0.004063237 | 2.9 | 6 +Source terms | 0.000364597 | 0.3 | 4 +Update ::mettre_a_jour | 0.005158036 | 3.6 | 4 +Solver for implicit diffusion | 0.009889927 | 6.9 | 4 +Computation of the time step dt | 0.003849252 | 2.7 | 6 +Post-treatment operations | 0.004260108 | 3.0 | 1 +Other operations | 0.00891642 | 6.3 | Average number of iteration of the linear solver per call: 54 @@ -63,16 +63,16 @@ Average number of iteration of the linear solver per call: 54 
----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.07797 | 53.9 | 2 | -Kernels: | 0.0604061 | 41.8 | 963 | -Copy host to device: | 0.00028165 | 0.2 | 21 | 3.5 GB/s -Copy device to host: | 0.000714904 | 0.5 | 9 | 9.6 GB/s -Alloc/Free on device: | 0.000213664 | 0.1 | 9 | -GPU: 96% Copy H<->D: 0.69% Alloc/free: 0.15% Comm: 0% CPU & I/O: 3.4% +Libraries: | 0.0775452 | 54.4 | 2 | +Kernels: | 0.0590798 | 41.5 | 963 | +Copy host to device: | 0.000279161 | 0.2 | 21 | 3.5 GB/s +Copy device to host: | 0.000780016 | 0.5 | 9 | 8.8 GB/s +Alloc/Free on device: | 9.50135e-05 | 0.1 | 9 | +GPU: 96% Copy H<->D: 0.74% Alloc/free: 0.067% Comm: 0% CPU & I/O: 3.3% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.00025942 +Time of the post-resolution: 0.000268055 -Total time for the whole computation 6.77001 +Total time for the whole computation 6.68216 diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is159479_cc120 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..91a4f8944a --- /dev/null +++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is159479_cc120 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the OpenMP_QC_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:36:40 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 393216 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 5.48508 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.218228 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.138249 +Number of time steps: 2 +Skipped time steps: 1 +Average time per time step: 0.0691244 +Standard deviation between time steps: 0.00128628 +Time elapsed in the 
skipped time steps: 0.194232 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0460209 | 66.6 | 2 +Matrix assembly for implicit scheme | 0.001367521 | 2.0 | 4 +Convection operator | 0.003197111 | 4.6 | 6 +Diffusion operator | 0.003536 | 5.1 | 18 +Gradient operator | 0.001323068 | 1.9 | 5 +Divergence operator | 0.000847333 | 1.2 | 6 +Source terms | 0.000163009 | 0.2 | 4 +Update ::mettre_a_jour | 0.002235803 | 3.2 | 4 +Solver for implicit diffusion | 0.003205218 | 4.6 | 4 +Computation of the time step dt | 0.000807989 | 1.2 | 6 +Post-treatment operations | 0.002658792 | 3.8 | 1 +Other operations | 0.003761627 | 5.4 | + +Average number of iteration of the linear solver per call: 54 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.045932 | 66.4 | 2 | +Kernels: | 0.0191397 | 27.7 | 963 | +Copy host to device: | 0.000208833 | 0.3 | 21 | 4.7 GB/s +Copy device to host: | 0.000781099 | 1.1 | 9 | 8.8 GB/s +Alloc/Free on device: | 4.53425e-05 | 0.1 | 9 | +GPU: 94% Copy H<->D: 1.4% Alloc/free: 0.066% Comm: 0% CPU & I/O: 4.4% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0001672 + +Total time for the whole computation 5.81773 + diff --git 
a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is247793_gfx1100 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is247793_gfx1100 index 698e29466a..8a943b5eb3 100644 --- a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is247793_gfx1100 +++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is247793_gfx1100 @@ -1,53 +1,78 @@ -Statistiques d'initialisation du calcul - -Temps total 7.87363 - -Statistiques de resolution du probleme - -Temps total 3.18878 - - -Timesteps 3 -Secondes / pas de temps 1.06292 -Dont solveurs Ax=B 0.786720 74% (2 appels/pas de temps) -Dont solveur diffusion_implicite 0.012582 1% (4 appels/pas de temps) -Dont assemblage matrice_implicite 0.039556 3% (4 appels/pas de temps) -Dont mettre_a_jour 0.007267 0% (4 appels/pas de temps) -Dont operateurs convection 0.042721 4% (6 appels/pas de temps) -Dont operateurs diffusion 0.025574 2% (18 appels/pas de temps) -Dont operateurs gradient 0.008015 0% (5 appels/pas de temps) -Dont operateurs divergence 0.003617 0% (5.66667 appels/pas de temps) -Dont operateurs source 0.002720 0% (4 appels/pas de temps) -Dont operations postraitement 0.119419 11% (1 appel/pas de temps) -Dont calcul dt 0.003073 0% (6 appels/pas de temps) -Dont calcul divers 0.011655 1% (0 appels/pas de temps) -Nb solveur / pas de temps 2 -Secondes / solveur 0.39336 -Iterations / solveur 369 -GPU statistics per time step (experimental): -Libraries : 0.786482 s 74.0% 2.0 calls -Kernels : 0.094407 s 8.9% 1409891.0 calls -Copy H2D : 0.026391 s 2.5% 71.0 calls 3.8 GB/s -Copy D2H : 0.003439 s 0.3% 76.0 calls 9.0 GB/s -Alloc/Free: 0.004708 s 0.4% 58.0 calls -GPU: 82.8% Copy H<->D: 2.8% Alloc/Free: 0.4% Comm: 0% CPU & Others: 13.8% -I/O: - -Timesteps = number of time steps -Nb solveur = number of linear system resolutions -Nb assemblage implicite = number of matrix assemblies for the implicit scheme -Iterations = average number of iterations of the solver -Communications = fraction of the time spent - in communications between processors (excluding io files) -Network latency = 
time of one mpsum measured by an internal bench over 0.1s -Network bandwidth = maximum on all processors - of the average bandwidth of send_recv operations -Waiting time = estimation of the waiting time of the different processors - -Max_waiting_time big => probably due to a bad partitioning -Communications > 30% => too many processors or network too slow - -Statistiques de post resolution - -Temps total 0.087521 + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the OpenMP_QC_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 19:05:11 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 393216 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 6.45541 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.599301 +Average number of iteration of the linear solver per call: 0 + 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 0.281635 +Number of time steps: 2 +Skipped time steps: 1 +Average time per time step: 0.140818 +Standard deviation between time steps: 0.00160022 +Time elapsed in the skipped time steps: 0.339711 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0737454 | 52.4 | 2 +Matrix assembly for implicit scheme | 0.00484449 | 3.4 | 4 +Convection operator | 0.00740623 | 5.3 | 6 +Diffusion operator | 0.0112249 | 8.0 | 18 +Gradient operator | 0.004945052 | 3.5 | 5 +Divergence operator | 0.003099721 | 2.2 | 6 +Source terms | 0.000512645 | 0.4 | 4 +Update ::mettre_a_jour | 0.006273922 | 4.5 | 4 +Solver for implicit diffusion | 0.01006549 | 7.1 | 4 +Computation of the time step dt | 0.003010985 | 2.1 | 6 +Post-treatment operations | 0.005375151 | 3.8 | 1 +Other operations | 0.0103136 | 7.3 | + +Average number of iteration of the linear solver per call: 56.2 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0735468 | 52.2 | 2 | +Kernels: | 0.0612288 | 43.5 | 963 | +Copy host to device: | 0.000498129 | 0.4 | 21 | 2.0 GB/s +Copy device to host: | 0.000596809 | 0.4 | 9 | 11.5 
GB/s +Alloc/Free on device: | 7.26065e-05 | 0.1 | 9 | +GPU: 96% Copy H<->D: 0.78% Alloc/free: 0.052% Comm: 0% CPU & I/O: 3.5% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.000191543 + +Total time for the whole computation 7.07695 diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.lumi_gfx90a b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.lumi_gfx90a index b42b08e3c6..3b64cfeb32 100644 --- a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.lumi_gfx90a +++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.lumi_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 24-02-2026 -- 00:32:35 -OS: nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 15-05-2026 -- 21:03:37 +OS: nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,40 +22,40 @@ Total number of elements used for the calculation: 393216 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 46.4693 +Total time of the start-up: 41.8307 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 
1.61811 +Average time of the resolution of the linear problem per call: 1.59457 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 0.303302 +Total time of the time loop: 0.275783 Number of time steps: 2 Skipped time steps: 1 -Average time per time step: 0.151651 -Standard deviation between time steps: 0.00396357 -Time elapsed in the skipped time steps: 0.480891 +Average time per time step: 0.137892 +Standard deviation between time steps: 0.00135879 +Time elapsed in the skipped time steps: 0.468064 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0502534 | 12.8 | 2 -Matrix assembly for implicit scheme | 0.008697519 | 2.2 | 4 -Convection operator | 0.008822867 | 2.3 | 6 -Diffusion operator | 0.01684057 | 4.3 | 18 -Gradient operator | 0.01633068 | 4.2 | 5 -Divergence operator | 0.004752977 | 1.2 | 6 -Source terms | 0.0005051555 | 0.1 | 4 -Update ::mettre_a_jour | 0.01738123 | 4.4 | 4 -Solver for implicit diffusion | 0.008576068 | 2.2 | 4 -Computation of the time step dt | 0.003482782 | 0.9 | 6 -Post-treatment operations | 0.008540245 | 2.2 | 1 -Other operations | 0.007467535 | 1.9 | +Linear solver resolutions Ax=B | 0.0452876 | 32.8 | 2 +Matrix assembly for implicit scheme | 0.008186238 | 5.9 | 4 +Convection operator | 0.008172697 | 5.9 | 6 +Diffusion operator | 0.01754431 | 12.7 | 18 +Gradient operator | 0.009107283 | 6.6 | 5 +Divergence operator | 0.004602755 | 3.3 | 6 +Source terms | 0.007739103 | 5.6 | 4 +Update ::mettre_a_jour | 0.008671992 
| 6.3 | 4 +Solver for implicit diffusion | 0.008301618 | 6.0 | 4 +Computation of the time step dt | 0.003485676 | 2.5 | 6 +Post-treatment operations | 0.008401907 | 6.1 | 1 +Other operations | 0.008390358 | 6.1 | -Average number of iteration of the linear solver per call: 54 +Average number of iteration of the linear solver per call: 56.2 ----------------------------------------------------------------------------------------------------------- @@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call: 54 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.049896 | 32.9 | 2 | -Kernels: | 0.0915515 | 60.4 | 963 | -Copy host to device: | 0.000483606 | 0.3 | 21 | 2.0 GB/s -Copy device to host: | 0.000589767 | 0.4 | 9 | 11.6 GB/s -Alloc/Free on device: | 7.3544e-05 | 0.0 | 9 | -GPU: 93% Copy H<->D: 0.71% Alloc/free: 0.048% Comm: 0% CPU & I/O: 6% +Libraries: | 0.0450666 | 32.7 | 2 | +Kernels: | 0.082456 | 59.8 | 963 | +Copy host to device: | 0.000476035 | 0.3 | 21 | 2.1 GB/s +Copy device to host: | 0.000586974 | 0.4 | 9 | 11.7 GB/s +Alloc/Free on device: | 5.8974e-05 | 0.0 | 9 | +GPU: 92% Copy H<->D: 0.77% Alloc/free: 0.043% Comm: 0% CPU & I/O: 6.7% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.00246653 +Time of the post-resolution: 0.00315599 -Total time for the whole computation 47.256 +Total time for the whole computation 42.5777 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (64 s): 0.438 kW 0.008 kWh 0.001 € (0.10€/kWh) 
diff --git a/tests/GPU/PETSC_GAMG/PETSC_GAMG.lml.gz b/tests/GPU/PETSC_GAMG/PETSC_GAMG.lml.gz new file mode 120000 index 0000000000..3b9fb55c30 --- /dev/null +++ b/tests/GPU/PETSC_GAMG/PETSC_GAMG.lml.gz @@ -0,0 +1 @@ +../JEL_bous/JEL_bous.lml.gz \ No newline at end of file diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS.data b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS.data index 84f1cff86d..5d480ad45e 100644 --- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS.data +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS.data @@ -41,7 +41,8 @@ Scatter DOM.Zones dom_fluide END SCATTER # -VEFPreP1B dis +VEFPreP1B dis +Lire dis { reorder { algo Hilbert } } Scheme_euler_implicit sch Read sch diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx90a b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx90a index d27d19598c..692c68d2fd 100644 --- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx90a +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 18:16:57 -OS: g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 24-05-2026 -- 16:10:58 +OS: g1266__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 54.0032 +Total time of the start-up: 44.6265 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 2.48562 +Average time of the resolution of the linear problem per call: 3.33197 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 16.1813 +Total time of the time loop: 13.9877 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.79792 -Standard deviation between time steps: 0.0782326 -Time elapsed in the skipped time steps: 15.1899 +Average time per time step: 1.55419 +Standard deviation between time steps: 0.0714623 +Time elapsed in the skipped time steps: 26.42 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.952062 | 27.3 | 3 -Matrix assembly for implicit scheme | 0.1737865 | 5.0 | 1 -Convection operator | 0.2060802 | 5.9 | 4 -Diffusion operator | 0.014529 | 0.4 | 2 -Divergence operator | 0.03175503 | 0.9 | 4 -Source terms | 0.0007008112 | 0.0 | 2 -Update ::mettre_a_jour | 0.01173972 | 0.3 | 4 -Computation of the time step dt | 0.001874476 | 0.1 | 4 -Post-treatment operations | 0.02246132 | 0.6 | 1 -Other operations | 0.3829349 | 11.0 | +Linear solver resolutions Ax=B | 0.88151 | 56.7 | 3 +Matrix assembly for implicit scheme | 0.1248069 | 8.0 | 1 +Convection operator | 0.152424 | 9.8 | 4 +Diffusion operator | 
0.01211028 | 0.8 | 2 +Divergence operator | 0.02130311 | 1.4 | 4 +Source terms | 0.0005407483 | 0.0 | 2 +Update ::mettre_a_jour | 0.00905912 | 0.6 | 4 +Computation of the time step dt | 0.001572527 | 0.1 | 4 +Post-treatment operations | 0.02116038 | 1.4 | 1 +Other operations | 0.3296992 | 21.2 | -Average number of iteration of the linear solver per call: 45.1 +Average number of iteration of the linear solver per call: 45.2 ----------------------------------------------------------------------------------------------------------- @@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call: 45.1 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.929697 | 51.7 | 3 | -Kernels: | 0.856626 | 47.6 | 435 | -Copy host to device: | 0.000691429 | 0.0 | 21 | 6.5 GB/s -Copy device to host: | 0.000787623 | 0.0 | 7 | 14.1 GB/s -Alloc/Free on device: | 0.00013016 | 0.0 | 4 | -GPU: 99% Copy H<->D: 0.082% Alloc/free: 0.0072% Comm: 0% CPU & I/O: 0.56% +Libraries: | 0.859416 | 55.3 | 3 | +Kernels: | 0.682799 | 43.9 | 433 | +Copy host to device: | 0.000688401 | 0.0 | 21 | 6.5 GB/s +Copy device to host: | 0.000780124 | 0.1 | 7 | 14.3 GB/s +Alloc/Free on device: | 0.000153926 | 0.0 | 4 | +GPU: 99% Copy H<->D: 0.094% Alloc/free: 0.0099% Comm: 0% CPU & I/O: 0.67% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.200093 +Time of the post-resolution: 0.203241 -Total time for the whole computation 85.5745 +Total time for the whole computation 85.2374 -Power consumption: 0 kW 0 
kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (91 s): 0.524 kW 0.013 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx942 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx942 index 3c55fe1cfc..068c7032b7 100644 --- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx942 +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx942 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 17-03-2026 -- 18:08:32 -OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 14:57:40 +OS: a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD Instinct MI300A Accelerator Total number of threads:192 GPU model: AMD Instinct MI300A @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 87.9476 +Total time of the start-up: 69.2005 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 3.59228 +Average time of the resolution of the linear problem per call: 4.08395 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 6.86811 +Total time of the time loop: 6.57321 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.763124 -Standard deviation between time steps: 0.111657 -Time elapsed in the skipped time steps: 26.2479 +Average time per time step: 0.730357 +Standard deviation between time steps: 0.108724 +Time elapsed in the skipped time steps: 29.3565 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.580801 | 76.1 | 3 -Matrix assembly for implicit scheme | 0.03724753 | 4.9 | 1 -Convection operator | 0.04440546 | 5.8 | 4 -Diffusion operator | 0.004041744 | 0.5 | 2 -Divergence operator | 0.008965229 | 1.2 | 4 -Source terms | 0.0002944104 | 0.0 | 2 -Update ::mettre_a_jour | 0.006270745 | 0.8 | 4 -Computation of the time step dt | 0.001288767 | 0.2 | 4 -Post-treatment operations | 0.01114546 | 1.5 | 1 -Other operations | 0.06866393 | 9.0 | +Linear solver resolutions Ax=B | 0.560471 | 76.7 | 3 +Matrix assembly for implicit scheme | 0.03394545 | 4.6 | 1 +Convection operator | 0.0405187 | 5.5 | 4 +Diffusion operator | 0.004115085 | 0.6 | 2 +Divergence operator | 0.006495622 | 0.9 | 4 +Source terms | 0.0003109058 | 0.0 | 2 +Update ::mettre_a_jour | 0.005631407 | 0.8 | 4 +Computation of the time step dt | 0.001290499 | 0.2 | 4 +Post-treatment operations | 0.01139088 | 1.6 | 1 +Other operations | 0.06618752 | 9.1 | -Average number of iteration of the linear solver per call: 45.1 +Average number of iteration of the linear solver per call: 45.2 ----------------------------------------------------------------------------------------------------------- @@ -61,17 +61,17 @@ Average number of 
iteration of the linear solver per call: 45.1 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.571281 | 74.9 | 3 | -Kernels: | 0.179249 | 23.5 | 433 | -Copy host to device: | 0.000686672 | 0.1 | 21 | 6.5 GB/s -Copy device to host: | 0.000558109 | 0.1 | 7 | 20.0 GB/s -Alloc/Free on device: | 0.00091174 | 0.1 | 4 | -GPU: 98% Copy H<->D: 0.16% Alloc/free: 0.12% Comm: 0% CPU & I/O: 1.4% +Libraries: | 0.551041 | 75.4 | 3 | +Kernels: | 0.165837 | 22.7 | 433 | +Copy host to device: | 0.000688216 | 0.1 | 21 | 6.5 GB/s +Copy device to host: | 0.000562285 | 0.1 | 7 | 19.8 GB/s +Alloc/Free on device: | 0.000853448 | 0.1 | 4 | +GPU: 98% Copy H<->D: 0.17% Alloc/free: 0.12% Comm: 0% CPU & I/O: 1.6% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.189987 +Time of the post-resolution: 0.207525 -Total time for the whole computation 121.254 +Total time for the whole computation 105.338 -[Slurm] Power consumption (136 s): 0.671 kW 0.025 kWh 0.003 € (0.10€/kWh) +[Slurm] Power consumption (116 s): 0.683 kW 0.022 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.dalianvl_cc100 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..a9f5b2197f --- /dev/null +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the PETSC_KOKKOS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:18:30 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 39.7573 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 2.71276 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.37842 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.264269 +Standard deviation between time steps: 0.0171366 +Time elapsed in the skipped time steps: 14.8921 + + +Standard counter description | 
Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.203571 | 77.0 | 3 +Matrix assembly for implicit scheme | 0.01233993 | 4.7 | 1 +Convection operator | 0.01126966 | 4.3 | 4 +Diffusion operator | 0.001828819 | 0.7 | 2 +Divergence operator | 0.001690409 | 0.6 | 4 +Source terms | 0.0001626662 | 0.1 | 2 +Update ::mettre_a_jour | 0.002726895 | 1.0 | 4 +Computation of the time step dt | 0.0005716936 | 0.2 | 4 +Post-treatment operations | 0.005884185 | 2.2 | 1 +Other operations | 0.02422413 | 9.2 | + +Average number of iteration of the linear solver per call: 45.1 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.196436 | 74.3 | 3 | +Kernels: | 0.0560966 | 21.2 | 433 | +Copy host to device: | 0.00043351 | 0.2 | 21 | 10.3 GB/s +Copy device to host: | 0.000356846 | 0.1 | 7 | 31.2 GB/s +Alloc/Free on device: | 0.00231052 | 0.9 | 4 | +GPU: 96% Copy H<->D: 0.3% Alloc/free: 0.87% Comm: 0% CPU & I/O: 3.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.155213 + +Total time for the whole computation 57.183 + diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.eureka_cc89 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..fcce0ed49b --- /dev/null +++ 
b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.eureka_cc89 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the PETSC_KOKKOS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:35:53 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 48.8101 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 3.58473 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 
8.85236 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.983596 +Standard deviation between time steps: 0.0820317 +Time elapsed in the skipped time steps: 37.7377 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.759662 | 77.2 | 3 +Matrix assembly for implicit scheme | 0.03474596 | 3.5 | 1 +Convection operator | 0.03646882 | 3.7 | 4 +Diffusion operator | 0.003950059 | 0.4 | 2 +Divergence operator | 0.004843026 | 0.5 | 4 +Source terms | 0.000650252 | 0.1 | 2 +Update ::mettre_a_jour | 0.004128376 | 0.4 | 4 +Computation of the time step dt | 0.001334431 | 0.1 | 4 +Post-treatment operations | 0.007428737 | 0.8 | 1 +Other operations | 0.1303845 | 13.3 | + +Average number of iteration of the linear solver per call: 45.1 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.727012 | 73.9 | 3 | +Kernels: | 0.246978 | 25.1 | 433 | +Copy host to device: | 0.00103084 | 0.1 | 21 | 4.3 GB/s +Copy device to host: | 0.00146876 | 0.1 | 7 | 7.6 GB/s +Alloc/Free on device: | 0.000581128 | 0.1 | 4 | +GPU: 99% Copy H<->D: 0.25% Alloc/free: 0.059% Comm: 0% CPU & I/O: 0.66% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0882911 + +Total time for the whole computation 95.4885 + 
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is157091_cc86 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is157091_cc86 index bfa2beebaa..cca066e59c 100644 --- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is157091_cc86 +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is157091_cc86 @@ -8,7 +8,7 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 12:01:52 +Date: 22-04-2026 -- 20:47:12 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 43.8829 +Total time of the start-up: 44.5716 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 2.7629 +Average time of the resolution of the linear problem per call: 3.22022 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 14.0367 +Total time of the time loop: 12.358 Number of time steps: 9 Skipped time steps: 1 -Average time per 
time step: 1.55963 -Standard deviation between time steps: 0.104933 -Time elapsed in the skipped time steps: 21.595 +Average time per time step: 1.37311 +Standard deviation between time steps: 0.098936 +Time elapsed in the skipped time steps: 28.8715 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 1.02044 | 65.4 | 3 -Matrix assembly for implicit scheme | 0.09923768 | 6.4 | 1 -Convection operator | 0.09379117 | 6.0 | 4 -Diffusion operator | 0.007355902 | 0.5 | 2 -Divergence operator | 0.02657271 | 1.7 | 4 -Source terms | 0.001260578 | 0.1 | 2 -Update ::mettre_a_jour | 0.0112653 | 0.7 | 4 -Computation of the time step dt | 0.00229102 | 0.1 | 4 -Post-treatment operations | 0.0168292 | 1.1 | 1 -Other operations | 0.2805938 | 18.0 | +Linear solver resolutions Ax=B | 0.930939 | 67.8 | 3 +Matrix assembly for implicit scheme | 0.07174572 | 5.2 | 1 +Convection operator | 0.06642676 | 4.8 | 4 +Diffusion operator | 0.005771405 | 0.4 | 2 +Divergence operator | 0.01625026 | 1.2 | 4 +Source terms | 0.0007618684 | 0.1 | 2 +Update ::mettre_a_jour | 0.00867656 | 0.6 | 4 +Computation of the time step dt | 0.002122282 | 0.2 | 4 +Post-treatment operations | 0.01133303 | 0.8 | 1 +Other operations | 0.2590803 | 18.9 | -Average number of iteration of the linear solver per call: 44.6 +Average number of iteration of the linear solver per call: 45.1 ----------------------------------------------------------------------------------------------------------- @@ -61,16 +61,16 @@ Average number of iteration of the linear solver per call: 44.6 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 
0.980411 | 62.9 | 3 | -Kernels: | 0.569024 | 36.5 | 435 | -Copy host to device: | 0.001424 | 0.1 | 21 | 3.1 GB/s -Copy device to host: | 0.0010956 | 0.1 | 7 | 10.2 GB/s -Alloc/Free on device: | 0.000533747 | 0.0 | 4 | -GPU: 99% Copy H<->D: 0.16% Alloc/free: 0.034% Comm: 0% CPU & I/O: 0.46% +Libraries: | 0.892538 | 65.0 | 3 | +Kernels: | 0.471266 | 34.3 | 433 | +Copy host to device: | 0.00140124 | 0.1 | 21 | 3.2 GB/s +Copy device to host: | 0.00110888 | 0.1 | 7 | 10.0 GB/s +Alloc/Free on device: | 0.000523076 | 0.0 | 4 | +GPU: 99% Copy H<->D: 0.18% Alloc/free: 0.038% Comm: 0% CPU & I/O: 0.46% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.301731 +Time of the post-resolution: 0.265491 -Total time for the whole computation 79.8164 +Total time for the whole computation 86.0666 diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is159479_cc120 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..c21639b3b7 --- /dev/null +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is159479_cc120 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the PETSC_KOKKOS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:37:16 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 34.6555 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 2.12537 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 4.81023 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.53447 +Standard deviation between time steps: 0.043241 +Time elapsed in the 
skipped time steps: 20.3312 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.411461 | 77.0 | 3 +Matrix assembly for implicit scheme | 0.02128278 | 4.0 | 1 +Convection operator | 0.02519871 | 4.7 | 4 +Diffusion operator | 0.002420275 | 0.5 | 2 +Divergence operator | 0.002600299 | 0.5 | 4 +Source terms | 0.0003420792 | 0.1 | 2 +Update ::mettre_a_jour | 0.002391873 | 0.4 | 4 +Computation of the time step dt | 0.0007405434 | 0.1 | 4 +Post-treatment operations | 0.005288055 | 1.0 | 1 +Other operations | 0.06274485 | 11.7 | + +Average number of iteration of the linear solver per call: 45.1 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.393842 | 73.7 | 3 | +Kernels: | 0.13385 | 25.0 | 433 | +Copy host to device: | 0.000492432 | 0.1 | 21 | 9.1 GB/s +Copy device to host: | 0.00135689 | 0.3 | 7 | 8.2 GB/s +Alloc/Free on device: | 0.000450243 | 0.1 | 4 | +GPU: 99% Copy H<->D: 0.35% Alloc/free: 0.084% Comm: 0% CPU & I/O: 0.84% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0666706 + +Total time for the whole computation 59.8636 + diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is247793_gfx1100 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is247793_gfx1100 new file mode 100644 
index 0000000000..f8732dd43a --- /dev/null +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is247793_gfx1100 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the PETSC_KOKKOS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 17:41:36 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2560000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 38.5024 + +Number of calls to the linear solver per time step: 1 +Average time of the resolution of the linear problem per call: 4.06631 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time 
loop statistics +Total time of the time loop: 33.9639 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 3.77377 +Standard deviation between time steps: 0.552431 +Time elapsed in the skipped time steps: 29.7015 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 3.40042 | 90.1 | 3 +Matrix assembly for implicit scheme | 0.06572012 | 1.7 | 1 +Convection operator | 0.077525 | 2.1 | 4 +Diffusion operator | 0.008007824 | 0.2 | 2 +Divergence operator | 0.01260207 | 0.3 | 4 +Source terms | 0.0009658954 | 0.0 | 2 +Update ::mettre_a_jour | 0.007784225 | 0.2 | 4 +Computation of the time step dt | 0.01158151 | 0.3 | 4 +Post-treatment operations | 0.01737637 | 0.5 | 1 +Other operations | 0.1717834 | 4.6 | + +Average number of iteration of the linear solver per call: 44.2 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 3.3543 | 88.9 | 3 | +Kernels: | 0.408706 | 10.8 | 433 | +Copy host to device: | 0.000746001 | 0.0 | 21 | 6.0 GB/s +Copy device to host: | 0.000728967 | 0.0 | 7 | 15.3 GB/s +Alloc/Free on device: | 0.000857389 | 0.0 | 4 | +GPU: 1e+02% Copy H<->D: 0.039% Alloc/free: 0.023% Comm: 0% CPU & I/O: 0.22% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0997579 + 
+Total time for the whole computation 102.268 + diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.jean-zay_cc90 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.jean-zay_cc90 index f2c082287e..f410cf6bb9 100644 --- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.jean-zay_cc90 +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.jean-zay_cc90 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 21-02-2026 -- 07:39:41 -OS: jzxh025__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +Date: 13-05-2026 -- 10:01:46 +OS: jzxh116__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 CPU model : Intel(R) Xeon(R) Platinum 8468 Total number of threads:192 GPU model: NVIDIA H100 80GB HBM3 @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 41.4787 +Total time of the start-up: 50.4068 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 2.43523 +Average time of the resolution of the linear problem per call: 3.0547 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step 
is not accounted for the computation of the time loop statistics -Total time of the time loop: 4.31082 +Total time of the time loop: 4.01735 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.47898 -Standard deviation between time steps: 0.0324794 -Time elapsed in the skipped time steps: 18.8744 +Average time per time step: 0.446372 +Standard deviation between time steps: 0.0328116 +Time elapsed in the skipped time steps: 30.0468 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.324965 | 67.8 | 3 -Matrix assembly for implicit scheme | 0.02646704 | 5.5 | 1 -Convection operator | 0.02453415 | 5.1 | 4 -Diffusion operator | 0.003655498 | 0.8 | 2 -Divergence operator | 0.006610645 | 1.4 | 4 -Source terms | 0.0004884063 | 0.1 | 2 -Update ::mettre_a_jour | 0.004852666 | 1.0 | 4 -Computation of the time step dt | 0.0008704661 | 0.2 | 4 -Post-treatment operations | 0.00899222 | 1.9 | 1 -Other operations | 0.07754328 | 16.2 | +Linear solver resolutions Ax=B | 0.314238 | 70.4 | 3 +Matrix assembly for implicit scheme | 0.01949049 | 4.4 | 1 +Convection operator | 0.01833284 | 4.1 | 4 +Diffusion operator | 0.003106423 | 0.7 | 2 +Divergence operator | 0.003050106 | 0.7 | 4 +Source terms | 0.0002473428 | 0.1 | 2 +Update ::mettre_a_jour | 0.0037138 | 0.8 | 4 +Computation of the time step dt | 0.0007315362 | 0.2 | 4 +Post-treatment operations | 0.009454939 | 2.1 | 1 +Other operations | 0.07400625 | 16.6 | -Average number of iteration of the linear solver per call: 45 +Average number of iteration of the linear solver per call: 45.1 ----------------------------------------------------------------------------------------------------------- @@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call: 45 
----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.313836 | 65.5 | 3 | -Kernels: | 0.15182 | 31.7 | 435 | -Copy host to device: | 0.000786192 | 0.2 | 21 | 5.7 GB/s -Copy device to host: | 0.00176988 | 0.4 | 7 | 6.3 GB/s -Alloc/Free on device: | 0.000779791 | 0.2 | 4 | -GPU: 97% Copy H<->D: 0.53% Alloc/free: 0.16% Comm: 0% CPU & I/O: 2.1% +Libraries: | 0.303017 | 67.9 | 3 | +Kernels: | 0.128906 | 28.9 | 433 | +Copy host to device: | 0.000767944 | 0.2 | 21 | 5.8 GB/s +Copy device to host: | 0.00174116 | 0.4 | 7 | 6.4 GB/s +Alloc/Free on device: | 0.000756607 | 0.2 | 4 | +GPU: 97% Copy H<->D: 0.56% Alloc/free: 0.17% Comm: 0% CPU & I/O: 2.5% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.193317 +Time of the post-resolution: 0.135781 -Total time for the whole computation 64.8573 +Total time for the whole computation 84.6067 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (94 s): 0.440 kW 0.011 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.lumi_gfx90a b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.lumi_gfx90a index cbe9f936da..b06ce76fa0 100644 --- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.lumi_gfx90a +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.lumi_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 24-02-2026 -- 00:35:37 -OS: nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 15-05-2026 -- 21:06:28 +OS: nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 87.1868 +Total time of the start-up: 92.6822 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 6.21404 +Average time of the resolution of the linear problem per call: 7.19735 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 15.8511 +Total time of the time loop: 13.9732 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 1.76123 -Standard deviation between time steps: 0.0801091 -Time elapsed in the skipped time steps: 30.8973 +Average time per time step: 1.55257 +Standard deviation between time steps: 0.0658627 +Time elapsed in the skipped time steps: 40.3491 Standard counter description | Time/step | % loop time | 
Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.942021 | 18.1 | 3 -Matrix assembly for implicit scheme | 0.1707076 | 3.3 | 1 -Convection operator | 0.2086225 | 4.0 | 4 -Diffusion operator | 0.01388809 | 0.3 | 2 -Divergence operator | 0.03186915 | 0.6 | 4 -Source terms | 0.0006588372 | 0.0 | 2 -Update ::mettre_a_jour | 0.01178035 | 0.2 | 4 -Computation of the time step dt | 0.001818052 | 0.0 | 4 -Post-treatment operations | 0.02213894 | 0.4 | 1 -Other operations | 0.3577303 | 6.9 | +Linear solver resolutions Ax=B | 0.873477 | 56.3 | 3 +Matrix assembly for implicit scheme | 0.1348632 | 8.7 | 1 +Convection operator | 0.1748085 | 11.3 | 4 +Diffusion operator | 0.01119091 | 0.7 | 2 +Divergence operator | 0.01998403 | 1.3 | 4 +Source terms | 0.00281269 | 0.2 | 2 +Update ::mettre_a_jour | 0.008613155 | 0.6 | 4 +Computation of the time step dt | 0.001514032 | 0.1 | 4 +Post-treatment operations | 0.01969925 | 1.3 | 1 +Other operations | 0.3056097 | 19.7 | -Average number of iteration of the linear solver per call: 44.6 +Average number of iteration of the linear solver per call: 45.2 ----------------------------------------------------------------------------------------------------------- @@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call: 44.6 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.919261 | 52.2 | 3 | -Kernels: | 0.830634 | 47.2 | 435 | -Copy host to device: | 0.000698769 | 0.0 | 21 | 6.4 GB/s -Copy device to host: | 0.000794326 | 0.0 | 7 | 14.0 GB/s -Alloc/Free on device: | 0.000123277 | 0.0 | 4 | -GPU: 99% Copy H<->D: 0.085% Alloc/free: 0.007% Comm: 0% CPU & I/O: 0.55% +Libraries: | 
0.851297 | 54.8 | 3 | +Kernels: | 0.689726 | 44.4 | 433 | +Copy host to device: | 0.00081278 | 0.1 | 21 | 5.5 GB/s +Copy device to host: | 0.000797494 | 0.1 | 7 | 14.0 GB/s +Alloc/Free on device: | 0.000133037 | 0.0 | 4 | +GPU: 99% Copy H<->D: 0.1% Alloc/free: 0.0086% Comm: 0% CPU & I/O: 0.63% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.190694 +Time of the post-resolution: 0.181855 -Total time for the whole computation 134.126 +Total time for the whole computation 147.186 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (170 s): 0.512 kW 0.024 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.topaze_cc80 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.topaze_cc80 index 0b7897eebc..c96cace81a 100644 --- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.topaze_cc80 +++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.topaze_cc80 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 15:58:09 -OS: topaze7070__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 15-05-2026 -- 13:55:52 +OS: topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 56.9018 +Total time of the start-up: 58.4819 Number of calls to the linear solver per time step: 1 -Average time of the resolution of the linear problem per call: 4.27597 +Average time of the resolution of the linear problem per call: 4.08312 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 6.81565 +Total time of the time loop: 6.22125 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.757294 -Standard deviation between time steps: 0.0561637 -Time elapsed in the skipped time steps: 29.5074 +Average time per time step: 0.69125 +Standard deviation between time steps: 0.0538561 +Time elapsed in the skipped time steps: 38.441 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.530545 | 70.1 | 3 -Matrix assembly for implicit scheme | 0.04442663 | 5.9 | 1 -Convection operator | 0.04330396 | 5.7 | 4 -Diffusion operator | 0.005314186 | 0.7 | 2 -Divergence operator | 0.008753432 | 1.2 | 4 -Source terms | 0.001030859 | 0.1 | 2 -Update ::mettre_a_jour | 0.004987524 | 0.7 | 4 -Computation of the time step dt | 0.001178208 | 0.2 | 4 -Post-treatment operations | 0.01122804 | 1.5 | 1 -Other operations | 0.1065263 | 14.1 | +Linear solver 
resolutions Ax=B | 0.510936 | 73.9 | 3 +Matrix assembly for implicit scheme | 0.02969765 | 4.3 | 1 +Convection operator | 0.02792525 | 4.0 | 4 +Diffusion operator | 0.004167876 | 0.6 | 2 +Divergence operator | 0.003910002 | 0.6 | 4 +Source terms | 0.000349984 | 0.1 | 2 +Update ::mettre_a_jour | 0.004006453 | 0.6 | 4 +Computation of the time step dt | 0.0009945288 | 0.1 | 4 +Post-treatment operations | 0.01014142 | 1.5 | 1 +Other operations | 0.09912006 | 14.3 | -Average number of iteration of the linear solver per call: 45 +Average number of iteration of the linear solver per call: 45.1 ----------------------------------------------------------------------------------------------------------- @@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call: 45 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.512967 | 67.7 | 3 | -Kernels: | 0.230442 | 30.4 | 434 | -Copy host to device: | 0.00184645 | 0.2 | 21 | 2.4 GB/s -Copy device to host: | 0.00140153 | 0.2 | 7 | 7.9 GB/s -Alloc/Free on device: | 0.00091814 | 0.1 | 4 | -GPU: 98% Copy H<->D: 0.43% Alloc/free: 0.12% Comm: 0% CPU & I/O: 1.3% +Libraries: | 0.493351 | 71.4 | 3 | +Kernels: | 0.183954 | 26.6 | 433 | +Copy host to device: | 0.00171774 | 0.2 | 21 | 2.6 GB/s +Copy device to host: | 0.000954437 | 0.1 | 7 | 11.7 GB/s +Alloc/Free on device: | 0.000935466 | 0.1 | 4 | +GPU: 98% Copy H<->D: 0.39% Alloc/free: 0.14% Comm: 0% CPU & I/O: 1.5% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.308962 +Time 
of the post-resolution: 0.211632 -Total time for the whole computation 93.5339 +Total time for the whole computation 103.356 -[Slurm] Power consumption (131 s): 0.377 kW 0.014 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (135 s): 0.554 kW 0.021 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/PNE_LES_LHe/PNE_LES_LHe.data b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe.data new file mode 100644 index 0000000000..2a66cff8f0 --- /dev/null +++ b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe.data @@ -0,0 +1,245 @@ +# PARALLEL OK # +# Warning: Mesh is reduced compared to the real one # +dimension 3 + +pb_thermohydraulique_turbulent pb_fluide +Domaine dom_fluide + +# BEGIN MESH # +Mailler dom_fluide +{ + Pave Cav_leftdown + { + Origine 0. 0. 0 + Nombre_de_Noeuds 11 10 3 + Longueurs 0.02 0.1 0.03 + Facteurs 1.01 1.0 1 + } + { + Bord front Z = 0. 0. <= X <= 0.02 0. <= Y <= 0.1 + Bord back Z = 0.03 0. <= X <= 0.02 0. <= Y <= 0.1 + Bord left X = 0. 0. <= Y <= 0.1 0. <= Z <= 0.03 + bord bas Y = 0. 0. <= X <= 0.02 0. <= Z <= 0.03 + } , + Pave Cav_leftmid + { + Origine 0. 0.1 0. + Nombre_de_Noeuds 11 110 3 + Longueurs 0.02 0.95 0.03 + Facteurs 1.01 1.0 1 + } + { + Bord front Z = 0. 0. <= X <= 0.02 0.1 <= Y <= 1.05 + Bord back Z = 0.03 0. <= X <= 0.02 0.1 <= Y <= 1.05 + Bord wallheat X = 0. 0.1 <= Y <= 1.05 0. <= Z <= 0.03 + } , + + Pave Cav_leftup + { + Origine 0. 1.05 0 + Nombre_de_Noeuds 11 25 3 + Longueurs 0.02 1.5 0.03 + Facteurs 1.01 1.01 1 + } + { + Bord left X = 0. 1.05 <= Y <= 2.55 0. <= Z <= 0.03 + bord up Y = 2.55 0. <= X <= 0.02 0. <= Z <= 0.03 + Bord front Z = 0. 0. <= X <= 0.02 1.05 <= Y <= 2.55 + Bord back Z = 0.03 0. <= X <= 0.02 1.05 <= Y <= 2.55 + } , + + Pave Cav_rightdown + { + Origine 0.02 0. 0 + Nombre_de_Noeuds 4 10 3 + Longueurs 0.23 0.1 0.03 + Facteurs 1 1.0 1 + } + { + Bord front Z = 0. 0.02 <= X <= 0.25 0. <= Y <= 0.1 + Bord back Z = 0.03 0.02 <= X <= 0.25 0. <= Y <= 0.1 + Bord right X = 0.25 0. <= Y <= 0.1 0. <= Z <= 0.03 + bord bas Y = 0. 0.02 <= X <= 0.249 0. 
<= Z <= 0.03 + bord inlet Y = 0. 0.249 <= X <= 0.25 0. <= Z <= 0.03 + } , + Pave Cav_rightmid + { + Origine 0.02 0.1 0 + Nombre_de_Noeuds 4 110 3 + Longueurs 0.23 0.95 0.03 + Facteurs 1 1.0 1 + } + { + Bord front Z = 0. 0.02 <= X <= 0.25 0.1 <= Y <= 1.05 + Bord back Z = 0.03 0.02 <= X <= 0.25 0.1 <= Y <= 1.05 + Bord right X = 0.25 0.1 <= Y <= 1.05 0. <= Z <= 0.03 + } , + Pave Cav_rightup + { + Origine 0.02 1.05 0 + Nombre_de_Noeuds 4 25 3 + Longueurs 0.23 1.5 0.03 + Facteurs 1 1.01 1 + } + { + Bord right X = 0.25 1.05 <= Y <= 2.55 0. <= Z <= 0.03 + bord up Y = 2.55 0.02 <= X <= 0.249 0. <= Z <= 0.03 + bord outlet Y = 2.55 0.249 <= X <= 0.25 0. <= Z <= 0.03 + Bord front Z = 0. 0.02 <= X <= 0.25 1.05 <= Y <= 2.55 + Bord back Z = 0.03 0.02 <= X <= 0.25 1.05 <= Y <= 2.55 + } +} +/* raffiner_isotrope dom_fluide raffiner_isotrope dom_fluide raffiner_isotrope dom_fluide */ +RegroupeBord dom_fluide perio { front back } +Corriger_frontiere_periodique { domaine dom_fluide bord perio } +# END MESH # + +# BEGIN PARTITION +Partition dom_fluide +{ + Partition_tool metis { Nb_parts 2 } + Larg_joint 2 + zones_name DOM + single_hdf +} +END PARTITION # + +# BEGIN SCATTER +Scatter DOM.Zones dom_fluide +END SCATTER # + +VDF dis +Runge_Kutta_ordre_3 sch +Read sch +{ + nb_pas_dt_max 10 + tinit 0 + tmax 30 + dt_min 5E-7 + dt_max 1E-2 + dt_start dt_fixe 1E-6 + dt_impr 1.e-1 + dt_sauv -1 + seuil_statio 1.e-18 + diffusion_implicite 1 + Facsec 1 +} + +Associate pb_fluide dom_fluide +Associate pb_fluide sch +Discretize pb_fluide dis + + +Read pb_fluide +{ + fluide_incompressible { + gravite Champ_Uniforme 3 0. 
-9.81 0 + mu Champ_Fonc_Fonction pb_fluide temperature 1 -2.90526e-05*val+6.56303e-06*val^2-5.0438e-07*val^3+4.71169e-05 + rho Champ_Uniforme 1 125.9132 + lambda Champ_Fonc_Fonction pb_fluide temperature 1 0.01966864+val-val + Cp Champ_Uniforme 1 5128.20066 + beta_th Champ_Fonc_Fonction pb_fluide temperature 1 -2395.84*val+805.478*val^2-120.134*val^3+6.70735*val^4+2667.35 + } + + Navier_Stokes_Turbulent + { + /* solveur_pression AMG GCP { atol 1.e-6 impr } */ + solveur_pression petsc_gpu cli + { + -pc_type hypre + -pc_hypre_type boomeramg + -pc_hypre_boomeramg_strong_threshold 0.5 + -pc_hypre_boomeramg_agg_nl 4 + -pc_hypre_boomeramg_agg_num_paths 5 + -pc_hypre_boomeramg_max_levels 25 + -pc_hypre_boomeramg_coarsen_type PMIS + -pc_hypre_boomeramg_interp_type ext+i + -pc_hypre_boomeramg_P_max 2 + -pc_hypre_boomeramg_truncfactor 0.5 + -ksp_atol 1e-5 + -ksp_max_it 20000 + } + Parametre_equation parametre_diffusion_implicite + { + crank 1 + niter_max_diffusion_implicite 300 + preconditionnement_diag 1 + seuil_diffusion_implicite 1.e-9 + } + + convection { quick } + diffusion { } + initial_conditions { vitesse champ_uniforme 3 0 0 0 } + sources { Boussinesq_temperature { T0 4.2 verif_boussinesq 0 } } + boundary_conditions + { + left paroi_fixe + up paroi_fixe + bas paroi_fixe + right paroi_fixe + perio periodique + outlet frontiere_ouverte_pression_imposee champ_front_uniforme 1 240000 + wallheat paroi_defilante champ_front_fonc_txyz 3 0.5*(15.1875*((y-0.1)/0.025)^5-35.4375*((y-0.1)/0.025)^4+20.25*((y-0.1)/0.025)^3)*sin(2*Pi*10*t)*(0.1D: 3.2% Alloc/free: 0.0047% Comm: 0% CPU & I/O: 89% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.179095 + +Total time for the whole computation 41.6582 + diff --git 
a/tests/GPU/PNE_LES_LHe/PNE_LES_LHe_BENCH.TU.topaze_cc80 b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe_BENCH.TU.topaze_cc80 new file mode 100644 index 0000000000..620438b849 --- /dev/null +++ b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe_BENCH.TU.topaze_cc80 @@ -0,0 +1,80 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the PNE_LES_LHe_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-06-2026 -- 12:33:04 +OS: topaze7062__Linux__x86_64__4.18.0-553.123.1.el8_10.x86_64__#1 SMP Mon May 4 13:45:48 EDT 2026 +CPU model : AMD EPYC 7763 64-Core Processor +Total number of threads:256 +GPU model: NVIDIA A100-SXM4-80GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1890304 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 21.2704 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.52921 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 26.8942 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 2.98825 +Standard deviation between time steps: 0.115967 +Time elapsed in the skipped time steps: 3.76817 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0290735 | 1.0 | 3 +Matrix assembly for implicit scheme | 2.659207 | 89.0 | 3 +Convection operator | 0.01160732 | 0.4 | 6 +Diffusion operator | 0.1153224 | 3.9 | 30 +Gradient operator | 0.002387247 | 0.1 | 6 +Divergence operator | 0.001326333 | 0.0 | 4 +Source terms | 0.0009539784 | 0.0 | 3 +Update ::mettre_a_jour | 0.01441476 | 0.5 | 1 +Solver for implicit diffusion | 0.01719583 | 0.6 | 6 +Computation of the time step dt | 0.08637916 | 2.9 | 10 +Turbulence model::update | 0.003609511 | 0.1 | 1 +Post-treatment operations | 0.04176783 | 1.4 | 1 +Other operations | 0.005002396 | 0.2 | + +Average number of iteration of the linear solver per call: 2.52 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0284574 | 1.0 | 3 | +Kernels: | 0.0965999 | 3.2 | 1783 | +Copy host to device: | 0.0240385 | 0.8 | 83 | 16.3 GB/s +Copy device to host: | 0.0485745 | 1.6 | 79 | 14.4 GB/s +Alloc/Free on device: | 0.000201251 | 0.0 | 2236 | +GPU: 4.2% Copy H<->D: 2.4% Alloc/free: 0.0067% 
Comm: 0% CPU & I/O: 93% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.305996 + +Total time for the whole computation 52.2388 + +[Slurm] Power consumption (97 s): 0.379 kW 0.010 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/PNE_LES_LHe/check_perf.sh b/tests/GPU/PNE_LES_LHe/check_perf.sh new file mode 120000 index 0000000000..6d20411c12 --- /dev/null +++ b/tests/GPU/PNE_LES_LHe/check_perf.sh @@ -0,0 +1 @@ +../DomainFlowLES/check_perf.sh \ No newline at end of file diff --git a/tests/GPU/TRUSTSingle/TRUSTSingle.TU.ref_is157091 b/tests/GPU/TRUSTSingle/TRUSTSingle.TU.ref_is157091 new file mode 100644 index 0000000000..44b677f2d0 --- /dev/null +++ b/tests/GPU/TRUSTSingle/TRUSTSingle.TU.ref_is157091 @@ -0,0 +1,60 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the TRUSTSingle_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 09-04-2026 -- 14:28:31 +OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 +CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores +Total number of threads:64 +GPU model: No GPU used for the computation +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 39601 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 0.269967 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.0943477 +Average number of iteration of the linear solver per call: 1 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the time loop: 3.16648 +Number of time steps: 100 +Skipped time steps: 0 +Average time per time step: 0.0316648 +Standard deviation between time steps: 0.000411583 +Time elapsed in the skipped time steps: 0 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear 
solver resolutions Ax=B | 0.014383 | 45.4 | 1 +Convection operator | 0.002030326 | 6.4 | 10 +Diffusion operator | 0.006760434 | 21.4 | 10 +Gradient operator | 0.000844102 | 2.7 | 2 +Divergence operator | 0.0005150209 | 1.6 | 2 +Update ::mettre_a_jour | 0.001913582 | 6.0 | 1 +Computation of the time step dt | 0.002560449 | 8.1 | 20 +Post-treatment operations | 0.0002793886 | 0.9 | 1 +Other operations | 0.002378514 | 7.5 | + +Average number of iteration of the linear solver per call: 1 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.000175592 + +Total time for the whole computation 3.43674 + diff --git a/tests/GPU/TRUSTSingle/TRUSTSingle.data b/tests/GPU/TRUSTSingle/TRUSTSingle.data new file mode 100644 index 0000000000..cd8b137659 --- /dev/null +++ b/tests/GPU/TRUSTSingle/TRUSTSingle.data @@ -0,0 +1,122 @@ +# Performance of TRUSTSingle # +dimension 2 + +Pb_Thermohydraulique_Cloned_Concentration pb +Domaine dom + +# BEGIN MESH # +Mailler dom +{ + Pave Cavite + { + Origine 0. 0. + Nombre_de_Noeuds 200 200 + Longueurs 0.01 0.01 + } + { + Bord Gauche X = 0. 0. <= Y <= 0.01 + Bord Haut Y = 0.01 0. <= X <= 0.01 + Bord Bas Y = 0. 0. <= X <= 0.01 + Bord Droit X = 0.01 0. <= Y <= 0.01 + } +} + +# END MESH # +# BEGIN PARTITION +Partition dom +{ + Partition_tool tranche { tranches 2 1 } + Larg_joint 2 + zones_name DOM +} +End +END PARTITION # + +# BEGIN SCATTER +Scatter DOM.Zones dom +END SCATTER # + +vdf dis + +Schema_euler_explicite sch +Read sch +{ + nb_pas_dt_max 100 + tinit 0 + dt_max 1e-5 + dt_impr 1e-10 + dt_sauv -1 +} + +Associate pb dom +Associate pb sch +Discretize pb dis + +Read pb +{ + Fluide_Incompressible + { + mu champ_fonc_fonction pb temperature 1 1.85e-5*(1+val*0.01) + rho Champ_Uniforme 1 1. 
+ lambda champ_fonc_fonction pb temperature 1 0.0262 + Cp Champ_Uniforme 1 1. + beta_th Champ_Uniforme 1 3.41e-3 + beta_co Champ_Uniforme 1 0.04 + gravite Champ_Uniforme 2 0 -9.81 + } + + Constituant + { + coefficient_diffusion champ_uniforme 8 0.000262 0.00262 0.0262 0.262 0.000262 0.00262 0.0262 0.262 + } + + Navier_Stokes_standard + { + solveur_pression petsc cholesky { } + + convection { amont } + diffusion { } + initial_conditions + { + vitesse Champ_Uniforme 2 0. 0. + } + boundary_conditions + { + Haut symetrie + Droit frontiere_ouverte_vitesse_imposee Champ_Front_Uniforme 2 0. 0. + Bas symetrie + Gauche frontiere_ouverte_vitesse_imposee Champ_Front_Uniforme 2 0. 0. + } + + } + Convection_Diffusion_Temperature + { + diffusion { } + convection { amont } + initial_conditions { Temperature Champ_Uniforme 1 0. } + boundary_conditions + { + Haut symetrie + Bas symetrie + Gauche frontiere_ouverte_temperature_imposee Champ_Front_Uniforme 1 0. + Droit frontiere_ouverte_temperature_imposee Champ_Front_Uniforme 1 0. + } + } + Convection_diffusion_Concentration + { + diffusion { } + convection { amont } + initial_conditions { concentration0 Champ_Uniforme 1 0. } + boundary_conditions + { + Haut symetrie + Bas symetrie + Gauche frontiere_ouverte_concentration_imposee Champ_Front_Uniforme 1 0. + Droit frontiere_ouverte_concentration_imposee Champ_Front_Uniforme 1 10. 
+ } + } +} + +Solve pb +End + diff --git a/tests/GPU/TRUSTSingle/TRUSTSingle.lml.gz b/tests/GPU/TRUSTSingle/TRUSTSingle.lml.gz new file mode 100644 index 0000000000..df58e35069 Binary files /dev/null and b/tests/GPU/TRUSTSingle/TRUSTSingle.lml.gz differ diff --git a/tests/GPU/TaylorGreen/TaylorGreen.data b/tests/GPU/TaylorGreen/TaylorGreen.data new file mode 100644 index 0000000000..dba39d9940 --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen.data @@ -0,0 +1,114 @@ +# Taylor-Green vortex DNS - Wang 2013, section 4.15 # +# 3D incompressible Navier-Stokes, Re=1600 # +# Domain: -pi*L <= x,y,z <= pi*L with L=1 # +# V0=1, tc=L/V0=1, t_final=20*tc=20 # +# Re = rho0*V0*L/mu = 1600 => mu = 1/1600 = 6.25e-4 # +# Initial conditions (Taylor-Green vortex): # +# u = V0*sin(x/L)*cos(y/L)*cos(z/L) = sin(x)*cos(y)*cos(z) # +# v = -V0*cos(x/L)*sin(y/L)*cos(z/L) = -cos(x)*sin(y)*cos(z) # +# w = 0 # +Dimension 3 +Pb_Hydraulique pb +Domaine dom + +# BEGIN MESH # +Mailler dom +{ + Pave Cube + { + Origine -3.14159265358979 -3.14159265358979 -3.14159265358979 + Nombre_de_Noeuds 6 6 6 + /* Nombre_de_Noeuds 101 101 101 */ + Longueurs 6.28318530717959 6.28318530717959 6.28318530717959 + } + { + Bord periox X = -3.14159265358979 -3.14159265358979 <= Y <= 3.14159265358979 -3.14159265358979 <= Z <= 3.14159265358979 + Bord periox X = 3.14159265358979 -3.14159265358979 <= Y <= 3.14159265358979 -3.14159265358979 <= Z <= 3.14159265358979 + Bord perioy Y = -3.14159265358979 -3.14159265358979 <= X <= 3.14159265358979 -3.14159265358979 <= Z <= 3.14159265358979 + Bord perioy Y = 3.14159265358979 -3.14159265358979 <= X <= 3.14159265358979 -3.14159265358979 <= Z <= 3.14159265358979 + Bord perioz Z = -3.14159265358979 -3.14159265358979 <= X <= 3.14159265358979 -3.14159265358979 <= Y <= 3.14159265358979 + Bord perioz Z = 3.14159265358979 -3.14159265358979 <= X <= 3.14159265358979 -3.14159265358979 <= Y <= 3.14159265358979 + } +} +Declarer_bord_perio { domaine dom bord periox } +Declarer_bord_perio { 
domaine dom bord perioy } +Declarer_bord_perio { domaine dom bord perioz } +# END MESH # + +# BEGIN PARTITION +Partition dom +{ + Partition_tool Metis { Nb_parts 4 } + Larg_joint 2 + single_hdf + zones_name DOM +} +End +END PARTITION # + +# BEGIN SCATTER +Scatter DOM.Zones dom +END SCATTER # + +VDF dis +Lire dis { reorder { algo hilbert } } + +Runge_Kutta_ordre_3 sch +Read sch +{ + nb_pas_dt_max 50 + tinit 0 + tmax 20 + dt_impr 1.0 + facsec 1. +} + +Associate pb dom +Associate pb sch +Discretize pb dis + +# +Domaine plan_x0 +Domaine plan_y0 +Domaine plan_z0 +Extraire_surface { domaine plan_x0 probleme pb condition_elements -0.032D: 0.31% Alloc/free: 0.63% Comm: 0% CPU & I/O: 6.2% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.037555 + +Total time for the whole computation 43.8542 + +[Slurm] Power consumption (51 s): 0.536 kW 0.008 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.adastra_gfx942 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.adastra_gfx942 new file mode 100644 index 0000000000..e1b77192d9 --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.adastra_gfx942 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 23-04-2026 -- 14:58:43 +OS: a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD Instinct MI300A Accelerator +Total number of threads:192 +GPU model: AMD Instinct MI300A +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 11.2893 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.694486 +Average number of iteration of the linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 16.38 +Number of time steps: 388 +Skipped time steps: 1 +Average time per time step: 0.0422165 +Standard deviation between time steps: 0.00388779 +Time elapsed in the skipped time steps: 0.160434 + + +Standard counter 
description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0230728 | 54.7 | 3 +Convection operator | 0.003841562 | 9.1 | 3 +Diffusion operator | 0.00227114 | 5.4 | 3 +Gradient operator | 0.001197736 | 2.8 | 6 +Divergence operator | 0.001136415 | 2.7 | 4 +Update ::mettre_a_jour | 0.001148386 | 2.7 | 1 +Computation of the time step dt | 0.0002428235 | 0.6 | 2 +Post-treatment operations | 0.006814134 | 16.1 | 1 +Other operations | 0.002491502 | 5.9 | + +Average number of iteration of the linear solver per call: 13.5 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0228135 | 54.0 | 3 | +Kernels: | 0.0121658 | 28.8 | 318 | +Copy host to device: | 0.000199058 | 0.5 | 13 | 0.8 GB/s +Copy device to host: | 0.000438415 | 1.0 | 1 | 49.8 GB/s +Alloc/Free on device: | 0.000203403 | 0.5 | 311 | +GPU: 83% Copy H<->D: 1.5% Alloc/free: 0.48% Comm: 0% CPU & I/O: 15% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0383888 + +Total time for the whole computation 27.8684 + +[Slurm] Power consumption (38 s): 0.784 kW 0.008 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.dalianvl_cc100 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..7da4e326b3 --- 
/dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 09-06-2026 -- 09:05:44 +OS: dalianvl06__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 6.74914 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.471625 +Average number of iteration of the linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.83892 +Number 
of time steps: 49 +Skipped time steps: 1 +Average time per time step: 0.037529 +Standard deviation between time steps: 0.00528329 +Time elapsed in the skipped time steps: 0.0876919 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0280251 | 74.7 | 3 +Convection operator | 0.001938012 | 5.2 | 3 +Diffusion operator | 0.001504543 | 4.0 | 3 +Gradient operator | 0.0009621812 | 2.6 | 6 +Divergence operator | 0.001100594 | 2.9 | 4 +Update ::mettre_a_jour | 0.0007023104 | 1.9 | 1 +Computation of the time step dt | 0.000180949 | 0.5 | 2 +Post-treatment operations | 0.0003962359 | 1.1 | 1 +Other operations | 0.00271915 | 7.2 | + +Average number of iteration of the linear solver per call: 14 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.026644 | 71.0 | 3 | +Kernels: | 0.00819043 | 21.8 | 304 | +Copy host to device: | 0.000177673 | 0.5 | 13 | 0.0 GB/s +Copy device to host: | 0.00025907 | 0.7 | 12 | 1.8 GB/s +Alloc/Free on device: | 1.19745e-05 | 0.0 | 286 | +GPU: 93% Copy H<->D: 1.2% Alloc/free: 0.032% Comm: 0% CPU & I/O: 6% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.033557 + +Total time for the whole computation 8.70936 + diff --git 
a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.eureka_cc89 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..462a4c6d56 --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.eureka_cc89 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:36:48 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 5.38887 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.62054 +Average number of iteration of the linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 25.6012 +Number of time steps: 388 +Skipped time steps: 1 +Average time per time step: 0.0659824 +Standard deviation between time steps: 0.0060902 +Time elapsed in the skipped time steps: 0.244451 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0404207 | 61.3 | 3 +Convection operator | 0.006148762 | 9.3 | 3 +Diffusion operator | 0.003561739 | 5.4 | 3 +Gradient operator | 0.001136671 | 1.7 | 6 +Divergence operator | 0.006674934 | 10.1 | 4 +Update ::mettre_a_jour | 0.001842883 | 2.8 | 1 +Computation of the time step dt | 0.0003300212 | 0.5 | 2 +Post-treatment operations | 0.004873083 | 7.4 | 1 +Other operations | 0.0009936003 | 1.5 | + +Average number of iteration of the linear solver per call: 13.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0402207 | 61.0 | 3 | +Kernels: | 0.0160424 | 24.3 | 315 | +Copy host to device: | 0.00310556 | 4.7 | 17 | 9.6 GB/s +Copy device to host: | 0.00392875 | 6.0 | 5 | 13.1 GB/s +Alloc/Free on device: | 0.0005011 | 0.8 | 311 | +GPU: 85% Copy H<->D: 11% Alloc/free: 0.76% Comm: 0% CPU & I/O: 3.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.014785 + +Total time for the whole computation 31.2497 + diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.irene-amd-ccrt_cc70 new file mode 100644 index 0000000000..e22a4dc95e --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.irene-amd-ccrt_cc70 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 23-04-2026 -- 09:45:25 +OS: irene7050__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz +Total number of threads:80 +GPU model: Tesla V100-SXM2-16GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 10.2371 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.90031 +Average number of iteration of the 
linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 35.8402 +Number of time steps: 388 +Skipped time steps: 1 +Average time per time step: 0.0923716 +Standard deviation between time steps: 0.0642075 +Time elapsed in the skipped time steps: 0.434717 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0624471 | 67.6 | 3 +Convection operator | 0.00941783 | 10.2 | 3 +Diffusion operator | 0.004559281 | 4.9 | 3 +Gradient operator | 0.00250871 | 2.7 | 6 +Divergence operator | 0.002154565 | 2.3 | 4 +Update ::mettre_a_jour | 0.002488689 | 2.7 | 1 +Computation of the time step dt | 0.0004637799 | 0.5 | 2 +Post-treatment operations | 0.003477803 | 3.8 | 1 +Other operations | 0.00485381 | 5.3 | + +Average number of iteration of the linear solver per call: 13.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0620342 | 67.2 | 3 | +Kernels: | 0.0232568 | 25.2 | 303 | +Copy host to device: | 0.000207016 | 0.2 | 13 | 0.8 GB/s +Copy device to host: | 6.68813e-05 | 0.1 | 0 | 3.3 GB/s +Alloc/Free on device: | 0.000659453 | 0.7 | 310 | +GPU: 92% Copy H<->D: 0.3% Alloc/free: 0.71% Comm: 0% CPU & I/O: 
6.7% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0348589 + +Total time for the whole computation 46.5472 + +[Slurm] Power consumption (59 s): 0.183 kW 0.003 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is157091_cc86 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is157091_cc86 new file mode 100644 index 0000000000..e6259a042c --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is157091_cc86 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 03-06-2026 -- 13:01:43 +OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 +CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores +Total number of threads:64 +GPU model: NVIDIA RTX A6000 +CUDA runtime version: 12.90 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+Total time of the start-up: 6.97009 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.637857 +Average number of iteration of the linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 4.59486 +Number of time steps: 49 +Skipped time steps: 1 +Average time per time step: 0.0937727 +Standard deviation between time steps: 0.00213292 +Time elapsed in the skipped time steps: 0.2318 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0688406 | 73.4 | 3 +Convection operator | 0.00943249 | 10.1 | 3 +Diffusion operator | 0.004282985 | 4.6 | 3 +Gradient operator | 0.002661855 | 2.8 | 6 +Divergence operator | 0.001830782 | 2.0 | 4 +Update ::mettre_a_jour | 0.0006778731 | 0.7 | 1 +Computation of the time step dt | 0.000528149 | 0.6 | 2 +Post-treatment operations | 0.0002804149 | 0.3 | 1 +Other operations | 0.005237539 | 5.6 | + +Average number of iteration of the linear solver per call: 14 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0682666 | 72.8 | 3 | +Kernels: | 0.0237455 | 25.3 | 304 | +Copy host to device: | 
0.000107392 | 0.1 | 13 | 0.0 GB/s +Copy device to host: | 4.20598e-05 | 0.0 | 0 | 10.8 GB/s +Alloc/Free on device: | 0.000506832 | 0.5 | 310 | +GPU: 98% Copy H<->D: 0.16% Alloc/free: 0.54% Comm: 0% CPU & I/O: 1.2% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0153142 + +Total time for the whole computation 11.8121 + diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is159479_cc120 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..dde5a18cbd --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is159479_cc120 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-05-2026 -- 07:06:42 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May 1 12:45:19 UTC 2026 (6 +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 3.16518 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.354792 +Average number of iteration of the linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 16.213 +Number of time steps: 388 +Skipped time steps: 1 +Average time per time step: 0.0417862 +Standard deviation between time steps: 0.00220827 +Time elapsed in the skipped time steps: 0.153858 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0305818 | 73.2 | 3 +Convection operator | 0.004429496 | 10.6 | 3 +Diffusion operator | 0.001778696 | 4.3 | 3 +Gradient operator | 0.0007466083 | 1.8 | 6 +Divergence operator | 0.0008054549 | 1.9 | 4 +Update ::mettre_a_jour | 0.0003303047 | 0.8 | 1 +Computation of the time step dt | 0.0001911036 | 0.5 | 2 +Post-treatment operations | 0.001386709 | 3.3 | 1 +Other operations | 0.001536038 | 3.7 | + +Average number of iteration of the linear solver per call: 13.6 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter 
description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0302758 | 72.5 | 3 | +Kernels: | 0.00976492 | 23.4 | 346 | +Copy host to device: | 7.26197e-05 | 0.2 | 13 | 0.0 GB/s +Copy device to host: | 0.000337603 | 0.8 | 4 | 5.9 GB/s +Alloc/Free on device: | 0.000283992 | 0.7 | 311 | +GPU: 96% Copy H<->D: 0.98% Alloc/free: 0.68% Comm: 0% CPU & I/O: 2.5% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.010984 + +Total time for the whole computation 19.5433 + diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is247793_gfx1100 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is247793_gfx1100 new file mode 100644 index 0000000000..dc699fcd7d --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is247793_gfx1100 @@ -0,0 +1,75 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 19:06:05 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 6.74851 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.962062 +Average number of iteration of the linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 40.4216 +Number of time steps: 388 +Skipped time steps: 1 +Average time per time step: 0.104179 +Standard deviation between time steps: 0.0656626 +Time elapsed in the skipped time steps: 0.433358 + + 
+Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0663882 | 63.7 | 3 +Convection operator | 0.009758096 | 9.4 | 3 +Diffusion operator | 0.006048223 | 5.8 | 3 +Gradient operator | 0.002385386 | 2.3 | 6 +Divergence operator | 0.002598763 | 2.5 | 4 +Update ::mettre_a_jour | 0.001005287 | 1.0 | 1 +Computation of the time step dt | 0.000558219 | 0.5 | 2 +Post-treatment operations | 0.01075544 | 10.3 | 1 +Other operations | 0.00468169 | 4.5 | + +Average number of iteration of the linear solver per call: 13.5 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.065981 | 63.3 | 3 | +Kernels: | 0.0356524 | 34.2 | 346 | +Copy host to device: | 0.000291844 | 0.3 | 13 | 0.0 GB/s +Copy device to host: | 0.000159518 | 0.2 | 4 | 12.5 GB/s +Alloc/Free on device: | 0.000699643 | 0.7 | 311 | +GPU: 98% Copy H<->D: 0.43% Alloc/free: 0.67% Comm: 0% CPU & I/O: 1.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0170791 + +Total time for the whole computation 47.6207 + diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.jean-zay_cc90 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.jean-zay_cc90 new file mode 100644 index 0000000000..d0336672bc --- /dev/null +++ 
b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.jean-zay_cc90 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 10-06-2026 -- 10:46:31 +OS: jzxh021__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 9.01156 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.588518 +Average number of iteration of the linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.0473 
+Number of time steps: 49 +Skipped time steps: 1 +Average time per time step: 0.0417815 +Standard deviation between time steps: 0.00209294 +Time elapsed in the skipped time steps: 0.265539 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0306315 | 73.3 | 3 +Convection operator | 0.002527348 | 6.0 | 3 +Diffusion operator | 0.001950996 | 4.7 | 3 +Gradient operator | 0.001071137 | 2.6 | 6 +Divergence operator | 0.001146943 | 2.7 | 4 +Update ::mettre_a_jour | 0.0008361189 | 2.0 | 1 +Computation of the time step dt | 0.0001930345 | 0.5 | 2 +Post-treatment operations | 0.0003362419 | 0.8 | 1 +Other operations | 0.003088216 | 7.4 | + +Average number of iteration of the linear solver per call: 14 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0301188 | 72.1 | 3 | +Kernels: | 0.00873178 | 20.9 | 304 | +Copy host to device: | 0.000165068 | 0.4 | 13 | 0.0 GB/s +Copy device to host: | 0.000310021 | 0.7 | 12 | 1.5 GB/s +Alloc/Free on device: | 7.35443e-06 | 0.0 | 286 | +GPU: 93% Copy H<->D: 1.1% Alloc/free: 0.018% Comm: 0% CPU & I/O: 5.9% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.022536 + +Total time for the whole computation 11.347 + +[Slurm] Power consumption (19 s): 0.422 kW 0.002 
kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.lumi_gfx90a b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.lumi_gfx90a new file mode 100644 index 0000000000..2c3595c17a --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.lumi_gfx90a @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 05-06-2026 -- 22:31:42 +OS: nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +CPU model : AMD EPYC 7A53 64-Core Processor +Total number of threads:128 +GPU model: AMD Instinct MI250X +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 72.1727 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 2.25663 +Average number of iteration of the linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 4.57714 +Number of time steps: 49 +Skipped time steps: 1 +Average time per time step: 0.0934111 +Standard deviation between time steps: 0.117934 +Time elapsed in the skipped time steps: 0.636137 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0451235 | 48.3 | 3 +Convection operator | 0.01070777 | 11.5 | 3 +Diffusion operator | 0.005400141 | 5.8 | 3 +Gradient operator | 0.001926701 | 2.1 | 6 +Divergence operator | 0.001946797 | 2.1 | 4 +Update ::mettre_a_jour | 0.001422196 | 1.5 | 1 +Computation of the time step dt | 0.0003921027 | 0.4 | 2 +Post-treatment operations | 0.02267105 | 24.3 | 1 +Other operations | 0.003820766 | 4.1 | + +Average number of iteration of the linear solver per call: 14 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0447554 | 47.9 | 3 | +Kernels: | 0.0460029 | 49.2 | 304 | +Copy host to device: | 0.000225169 | 0.2 | 13 | 0.0 GB/s +Copy device to host: | 2.55554e-05 | 0.0 | 0 | 17.9 GB/s +Alloc/Free on device: | 0.000507781 | 0.5 | 310 | +GPU: 97% Copy H<->D: 0.27% Alloc/free: 0.54% Comm: 0% CPU & I/O: 2% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0320638 + +Total time for the whole computation 77.4181 + +[Slurm] Power consumption (96 s): 0.507 kW 0.014 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.topaze_cc80 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.topaze_cc80 new file mode 100644 index 0000000000..381f0b17a4 --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.topaze_cc80 @@ -0,0 +1,76 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-06-2026 -- 12:34:42 +OS: topaze7062__Linux__x86_64__4.18.0-553.123.1.el8_10.x86_64__#1 SMP Mon May 4 13:45:48 EDT 2026 +CPU model : AMD EPYC 7763 64-Core Processor +Total number of threads:256 +GPU model: NVIDIA A100-SXM4-80GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 10.4623 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 
1.03733 +Average number of iteration of the linear solver per call: 11 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.76033 +Number of time steps: 49 +Skipped time steps: 1 +Average time per time step: 0.0563332 +Standard deviation between time steps: 0.00395212 +Time elapsed in the skipped time steps: 0.283041 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0419811 | 74.5 | 3 +Convection operator | 0.00381971 | 6.8 | 3 +Diffusion operator | 0.002497447 | 4.4 | 3 +Gradient operator | 0.001444911 | 2.6 | 6 +Divergence operator | 0.001232232 | 2.2 | 4 +Update ::mettre_a_jour | 0.001462763 | 2.6 | 1 +Computation of the time step dt | 0.000249541 | 0.4 | 2 +Post-treatment operations | 0.0005570403 | 1.0 | 1 +Other operations | 0.003088481 | 5.5 | + +Average number of iteration of the linear solver per call: 14 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0413206 | 73.4 | 3 | +Kernels: | 0.0121898 | 21.6 | 304 | +Copy host to device: | 0.000149844 | 0.3 | 13 | 0.0 GB/s +Copy device to host: | 0.000362689 | 0.6 | 12 | 1.3 GB/s +Alloc/Free on device: | 9.97549e-06 | 0.0 | 286 | +GPU: 95% Copy H<->D: 
0.91% Alloc/free: 0.018% Comm: 0% CPU & I/O: 4.1% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0336353 + +Total time for the whole computation 13.5394 + +[Slurm] Power consumption (57 s): 0.407 kW 0.006 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/TaylorGreen/TaylorGreen_EC.son_ref b/tests/GPU/TaylorGreen/TaylorGreen_EC.son_ref new file mode 100644 index 0000000000..addc776b60 --- /dev/null +++ b/tests/GPU/TaylorGreen/TaylorGreen_EC.son_ref @@ -0,0 +1,14 @@ +# Temps Energie_cinetique_totale +0.00000000e+00 3.10062767e+01 +1.06625268e+00 2.47998448e+01 +2.07434992e+00 2.06924939e+01 +3.13906674e+00 1.73021176e+01 +4.31395623e+00 1.42913666e+01 +5.63905483e+00 1.16240944e+01 +7.15181113e+00 9.31943515e+00 +8.89094816e+00 7.32615578e+00 +1.08957949e+01 5.64517338e+00 +1.32159065e+01 4.28041341e+00 +1.59167146e+01 3.21474828e+00 +1.90915042e+01 2.40377983e+00 +2.28934723e+01 1.79118993e+00 diff --git a/tests/GPU/TaylorGreen/check_perf.sh b/tests/GPU/TaylorGreen/check_perf.sh new file mode 120000 index 0000000000..6d20411c12 --- /dev/null +++ b/tests/GPU/TaylorGreen/check_perf.sh @@ -0,0 +1 @@ +../DomainFlowLES/check_perf.sh \ No newline at end of file diff --git a/tests/GPU/TaylorGreen/scaling.sh b/tests/GPU/TaylorGreen/scaling.sh new file mode 120000 index 0000000000..e60cf18f07 --- /dev/null +++ b/tests/GPU/TaylorGreen/scaling.sh @@ -0,0 +1 @@ +../OpenMP_Iterateur/scaling.sh \ No newline at end of file diff --git a/tests/GPU/TaylorGreen/verifie b/tests/GPU/TaylorGreen/verifie new file mode 100755 index 0000000000..4c1ba1ff1f --- /dev/null +++ b/tests/GPU/TaylorGreen/verifie @@ -0,0 +1 @@ +compare_sonde TaylorGreen_EC.son_ref TaylorGreen_EC.son 1>verifie.log 2>&1 || exit -1 diff --git 
a/tests/GPU/VDF_90M_180GB/VDF_90M_180GB.data b/tests/GPU/VDF_90M_180GB/VDF_90M_180GB.data new file mode 100644 index 0000000000..3da1278cbc --- /dev/null +++ b/tests/GPU/VDF_90M_180GB/VDF_90M_180GB.data @@ -0,0 +1,114 @@ +# Test memory on device # +# PARALLEL OK # +Dimension 3 + +Pb_hydraulique_turbulent pb + +Domaine dom_perio + +# BEGIN MESH # +Mailler dom_perio +{ + Pave pave + { + /* warning dumb geometry */ + Origine -30 0. 0. + Nombre_de_Noeuds 1001 301 301 + Longueurs 30 2 10 + } + { + Bord Periox X = -30 0. <= Y <= 2. 0. <= Z <= 10. + Bord Periox X = 0 0. <= Y <= 2. 0. <= Z <= 10. + Bord LowerWall Y = 0. -30. <= X <= 0. 0. <= Z <= 10. + Bord UpperWall Y = 2. -30. <= X <= 0. 0. <= Z <= 10. + Bord Perioz Z = 0. -30. <= X <= 0. 0. <= Y <= 2. + Bord Perioz Z = 10. -30. <= X <= 0. 0. <= Y <= 2. + } +} +Declarer_bord_perio { domaine dom_perio bord Periox } +Declarer_bord_perio { domaine dom_perio bord Perioz } +# END MESH # + +# BEGIN PARTITION +Partition dom_perio +{ + Partition_tool Metis { Nb_parts 4 } + Larg_joint 2 + zones_name DOM +} +End +END PARTITION # + +# BEGIN SCATTER +Scatter DOM.Zones dom_perio +END SCATTER # + +VDF dis +Lire dis { reorder { algo hilbert } } + +Runge_Kutta_ordre_3 sch +Lire sch +{ + nb_pas_dt_max 0 + dt_sauv -1 + tinit 0 + dt_impr 1e-6 + facsec 2 + precision_impr 8 + tcpumax 23 +} + +Associer pb dom_perio +Associer pb sch + +Discretiser pb dis + +Lire pb +{ + Fluide_incompressible + { + mu champ_uniforme 1 3.5e-04 + rho champ_uniforme 1 1 + } + Navier_Stokes_turbulent + { + Solveur_pression AMG GCP { rtol 1e-15 impr } +# + solveur_pression petsc cli + { + -ksp_view + -ksp_type gmres + -ksp_norm_type unpreconditioned + -pc_type hypre + -pc_hypre_type boomeramg + -pc_mg_galerkin_mat_product_algorithm hypre + -pc_hypre_boomeramg_relax_type_all l1scaled-Jacobi + -pc_hypre_boomeramg_coarsen_type pmis + -pc_hypre_boomeramg_interp_type ext+i + -pc_hypre_boomeramg_strong_threshold 0.30 + -pc_hypre_boomeramg_print_statistics 1 + -ksp_rtol 
1e-15 impr + } +# + conditions_initiales { + vitesse champ_uniforme 3 1 0 0 + pression champ_uniforme 1 0 + } + conditions_limites { + Periox periodique + Perioz periodique + LowerWall paroi_fixe + UpperWall paroi_fixe + } + convection { centre4 } + diffusion { } + sources { canal_perio { bord Periox } } + modele_turbulence null { } + } +} + +EcritureLectureSpecial 0 + +Resoudre pb + +Fin diff --git a/tests/GPU/VEF_75M_190GB/VEF_75M_190GB.data b/tests/GPU/VEF_75M_190GB/VEF_75M_190GB.data new file mode 100644 index 0000000000..39f491bb10 --- /dev/null +++ b/tests/GPU/VEF_75M_190GB/VEF_75M_190GB.data @@ -0,0 +1,121 @@ +# RAM on device # +# copyToDevice 0 # +dimension 3 +Pb_thermohydraulique_Turbulent pb1 +Domaine dom +# BEGIN MESH # +Mailler dom +{ + Pave Entree + { + Origine 0. 0. 0. + Nombre_de_Noeuds 58 58 571 + Longueurs 1 1 10 + } + { + Bord walls X = 0. 0. <= Y <= 1. 0. <= Z <= 10. + Bord walls X = 1. 0. <= Y <= 1. 0. <= Z <= 10. + Bord walls Y = 0. 0. <= X <= 1. 0. <= Z <= 10. + Bord walls Y = 1. 0. <= X <= 1. 0. <= Z <= 10. + Bord inlet Z = 0. 0. <= X <= 1. 0. <= Y <= 1. + Bord outlet Z = 10. 0. <= X <= 1. 0. <= Y <= 1. + } +} +Tetraedriser_homogene dom +VerifierCoin dom { } +# END MESH # +# BEGIN PARTITION +Partition dom +{ + Partition_tool Metis { Nb_parts 8 } + Larg_joint 2 + zones_name dom +} +End +END PARTITION # + +# BEGIN SCATTER +Scatter dom.Zones dom +END SCATTER # + +vef dis +Lire dis { reorder { algo hilbert } } + +Runge_Kutta_Rationnel_ordre_2 sch +Lire sch +{ + nb_pas_dt_max 0 + dt_sauv -1 + tinit 0. + tcpumax 47.00 + dt_impr 0.0000001 + dt_start dt_calc + tmax 2. 
+ dt_min 1.e-10 + dt_max 6.e-3 + seuil_statio 1.e-14 + facsec 1 + diffusion_implicite 1 + seuil_diffusion_implicite 1.e-10 +} + + +Associer pb1 dom +Associer pb1 sch +Discretiser pb1 dis + +Lire pb1 +{ + + fluide_incompressible + { + gravite champ_uniforme 3 0 0 -9.81 + mu Champ_Uniforme 1 0.008 + rho Champ_Uniforme 1 995.2 + lambda Champ_Uniforme 1 0.062 + Cp Champ_Uniforme 1 4.1802 + beta_th Champ_Uniforme 1 0.002902383982248589 + } + + Navier_Stokes_turbulent + { + solveur_pression AMG GCP { rtol 1.e-6 impr } + convection { muscl } + diffusion { } + conditions_initiales { vitesse Champ_uniforme 3 0. 0. 0. } + sources { boussinesq_temperature { T0 30. } } + conditions_limites + { + inlet frontiere_ouverte_vitesse_imposee Champ_front_uniforme 3 0. 0. 0.31 + walls paroi_fixe + outlet frontiere_ouverte_pression_imposee Champ_front_uniforme 1 0 + } + + Modele_turbulence Sous_maille_wale + { + cw 0.5 + turbulence_paroi negligeable + } + } + Convection_Diffusion_Temperature_Turbulent + { + diffusion { } + convection { muscl } + conditions_initiales { temperature champ_fonc_xyz dom 1 ((x*x+y*y)[(10e-3*10e-3)*(z[0.06))*20+20 } + boundary_conditions + { + outlet frontiere_ouverte T_ext champ_front_uniforme 1 20 + inlet frontiere_ouverte_temperature_imposee champ_front_uniforme 1 40 + walls Paroi_adiabatique + } + Modele_turbulence Prandtl + { + turbulence_paroi negligeable_scalaire + } + } + +} + + +Resoudre pb1 +Fin diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky.data b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky.data index f3477328aa..d51cb9b4e2 100644 --- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky.data +++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky.data @@ -40,7 +40,9 @@ END PARTITION # Scatter dom.Zones dom END SCATTER # -vef dis +vef dis +Lire dis { reorder { algo Hilbert } } + Runge_Kutta_Rationnel_ordre_2 sch Lire sch { diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.dalianvl_cc100 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.dalianvl_cc100 new 
file mode 100644 index 0000000000..063ea275e2 --- /dev/null +++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the cuDSS_cholesky_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-06-2026 -- 12:53:42 +OS: dalianvl16__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2332800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 13.2371 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.120239 +Average number of iteration of the linear solver per call: 2 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop 
statistics +Total time of the time loop: 0.696072 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.0773413 +Standard deviation between time steps: 0.00315073 +Time elapsed in the skipped time steps: 0.324934 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.000121945 | 0.2 | 2 +Convection operator | 0.008095081 | 10.5 | 4 +Diffusion operator | 0.02787197 | 36.0 | 32 +Gradient operator | 0.004787126 | 6.2 | 4 +Divergence operator | 0.001358279 | 1.8 | 3 +Source terms | 0.003056822 | 4.0 | 2 +Update ::mettre_a_jour | 0.001711101 | 2.2 | 1 +Solver for implicit diffusion | 0.01594707 | 20.6 | 4 +Computation of the time step dt | 0.002858162 | 3.7 | 8 +Turbulence model::update | 0.0005988777 | 0.8 | 1 +Post-treatment operations | 0.00524563 | 6.8 | 1 +Other operations | 0.005689237 | 7.4 | + +Average number of iteration of the linear solver per call: 2 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 9.37107e-05 | 0.1 | 2 | +Kernels: | 0.0693074 | 89.6 | 1063 | +Copy host to device: | 0.000347382 | 0.4 | 18 | 10.3 GB/s +Copy device to host: | 0.000251933 | 0.3 | 7 | 41.9 GB/s +Alloc/Free on device: | 0.000161753 | 0.2 | 0 | +GPU: 90% Copy H<->D: 0.77% Alloc/free: 0.21% Comm: 0% CPU & I/O: 9.3% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.157463 + +Total time for the whole computation 14.4155 + diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.eureka_cc89 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..126062ed7c --- /dev/null +++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.eureka_cc89 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the cuDSS_cholesky_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:38:14 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2332800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 58.8161 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 12.2049 +Average number of iteration 
of the linear solver per call: 2 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.3888 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.265422 +Standard deviation between time steps: 0.00700682 +Time elapsed in the skipped time steps: 1.30429 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.10549 | 39.7 | 2 +Convection operator | 0.0174541 | 6.6 | 4 +Diffusion operator | 0.04629194 | 17.4 | 26 +Gradient operator | 0.01120828 | 4.2 | 4 +Divergence operator | 0.003497302 | 1.3 | 3 +Source terms | 0.01068952 | 4.0 | 2 +Update ::mettre_a_jour | 0.004092763 | 1.5 | 1 +Solver for implicit diffusion | 0.03844643 | 14.5 | 4 +Computation of the time step dt | 0.007731785 | 2.9 | 8 +Turbulence model::update | 0.001743632 | 0.7 | 1 +Post-treatment operations | 0.006007037 | 2.3 | 1 +Other operations | 0.01276892 | 4.8 | + +Average number of iteration of the linear solver per call: 2 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.105468 | 39.7 | 2 | +Kernels: | 0.152195 | 57.3 | 906 | +Copy host to device: | 0.000530083 | 0.2 | 18 | 6.7 GB/s +Copy device to host: | 0.00182647 | 0.7 
| 7 | 5.8 GB/s +Alloc/Free on device: | 0.00010853 | 0.0 | 0 | +GPU: 97% Copy H<->D: 0.89% Alloc/free: 0.041% Comm: 0% CPU & I/O: 2% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0813076 + +Total time for the whole computation 62.5905 + diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.irene-amd-ccrt_cc70 index 89723c8543..f0274bdf4c 100644 --- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.irene-amd-ccrt_cc70 +++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.irene-amd-ccrt_cc70 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 15:20:40 -OS: irene7067__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 23-04-2026 -- 15:15:24 +OS: irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz Total number of threads:80 GPU model: Tesla V100-SXM2-16GB @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 72.5068 +Total time of the start-up: 75.6536 Number of calls to the linear solver per time step: 2 -Average time of 
the resolution of the linear problem per call: 13.6316 +Average time of the resolution of the linear problem per call: 14.301 Average number of iteration of the linear solver per call: 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 4.69841 +Total time of the time loop: 3.67179 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.522046 -Standard deviation between time steps: 0.016306 -Time elapsed in the skipped time steps: 1.94551 +Average time per time step: 0.407977 +Standard deviation between time steps: 0.0117863 +Time elapsed in the skipped time steps: 1.8132 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.101177 | 19.4 | 2 -Convection operator | 0.0427997 | 8.2 | 4 -Diffusion operator | 0.1635086 | 31.3 | 26 -Gradient operator | 0.03588981 | 6.9 | 4 -Divergence operator | 0.02340071 | 4.5 | 3 -Source terms | 0.03244575 | 6.2 | 2 -Update ::mettre_a_jour | 0.01491838 | 2.9 | 1 -Solver for implicit diffusion | 0.04903136 | 9.4 | 4 -Computation of the time step dt | 0.03761318 | 7.2 | 8 -Turbulence model::update | 0.004650237 | 0.9 | 1 -Post-treatment operations | 0.00782911 | 1.5 | 1 -Other operations | 0.008781795 | 1.7 | +Linear solver resolutions Ax=B | 0.12405 | 30.4 | 2 +Convection operator | 0.02512303 | 6.2 | 4 +Diffusion operator | 0.09736093 | 23.9 | 26 +Gradient operator | 0.01811066 | 4.4 | 4 +Divergence operator | 0.01293818 | 3.2 | 3 +Source terms | 0.02176257 | 5.3 | 2 +Update ::mettre_a_jour | 0.01019637 | 2.5 | 1 +Solver for implicit diffusion | 
0.04906131 | 12.0 | 4 +Computation of the time step dt | 0.02536967 | 6.2 | 8 +Turbulence model::update | 0.003636164 | 0.9 | 1 +Post-treatment operations | 0.008549231 | 2.1 | 1 +Other operations | 0.01181943 | 2.9 | Average number of iteration of the linear solver per call: 2 @@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call: 2 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.10115 | 19.4 | 2 | -Kernels: | 0.410303 | 78.6 | 908 | -Copy host to device: | 0.00108591 | 0.2 | 18 | 3.3 GB/s -Copy device to host: | 0.00235037 | 0.5 | 7 | 4.5 GB/s -Alloc/Free on device: | 8.16001e-05 | 0.0 | 0 | -GPU: 98% Copy H<->D: 0.66% Alloc/free: 0.016% Comm: 0% CPU & I/O: 1.4% +Libraries: | 0.124019 | 30.4 | 2 | +Kernels: | 0.272361 | 66.8 | 906 | +Copy host to device: | 0.00113228 | 0.3 | 18 | 3.2 GB/s +Copy device to host: | 0.00262643 | 0.6 | 7 | 4.0 GB/s +Alloc/Free on device: | 9.4755e-05 | 0.0 | 0 | +GPU: 97% Copy H<->D: 0.92% Alloc/free: 0.023% Comm: 0% CPU & I/O: 1.9% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.163355 +Time of the post-resolution: 0.174586 -Total time for the whole computation 79.3141 +Total time for the whole computation 81.3132 -[Slurm] Power consumption (104 s): 0.176 kW 0.005 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (99 s): 0.201 kW 0.006 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is157091_cc86 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is157091_cc86 index 
f65252b26c..e21c107726 100644 --- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is157091_cc86 +++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is157091_cc86 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 10-03-2026 -- 08:45:30 +Date: 31-05-2026 -- 09:28:51 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 GPU model: NVIDIA RTX A6000 CUDA runtime version: 12.90 -CUDA drivers version: 12.70 +CUDA drivers version: 13.20 Nb procs used for the computation: 1 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 2332800 @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 52.0033 +Total time of the start-up: 37.9452 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 10.7848 +Average time of the resolution of the linear problem per call: 10.8771 Average number of iteration of the linear solver per call: 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 
4.45148 +Total time of the time loop: 3.85018 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.494609 -Standard deviation between time steps: 0.0137016 -Time elapsed in the skipped time steps: 1.21885 +Average time per time step: 0.427798 +Standard deviation between time steps: 0.0119921 +Time elapsed in the skipped time steps: 1.148 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.117044 | 23.7 | 2 -Convection operator | 0.04088689 | 8.3 | 4 -Diffusion operator | 0.1189428 | 24.0 | 26 -Gradient operator | 0.03040632 | 6.1 | 4 -Divergence operator | 0.01833053 | 3.7 | 3 -Source terms | 0.03895697 | 7.9 | 2 -Update ::mettre_a_jour | 0.01322621 | 2.7 | 1 -Solver for implicit diffusion | 0.05433776 | 11.0 | 4 -Computation of the time step dt | 0.04061406 | 8.2 | 8 -Turbulence model::update | 0.005045901 | 1.0 | 1 -Post-treatment operations | 0.005178893 | 1.0 | 1 -Other operations | 0.01163865 | 2.4 | +Linear solver resolutions Ax=B | 0.117641 | 27.5 | 2 +Convection operator | 0.03657583 | 8.5 | 4 +Diffusion operator | 0.09317057 | 21.8 | 26 +Gradient operator | 0.01422524 | 3.3 | 4 +Divergence operator | 0.01207189 | 2.8 | 3 +Source terms | 0.03228191 | 7.5 | 2 +Update ::mettre_a_jour | 0.01044167 | 2.4 | 1 +Solver for implicit diffusion | 0.05658411 | 13.2 | 4 +Computation of the time step dt | 0.03139697 | 7.3 | 8 +Turbulence model::update | 0.004705404 | 1.1 | 1 +Post-treatment operations | 0.005553535 | 1.3 | 1 +Other operations | 0.01315024 | 3.1 | Average number of iteration of the linear solver per call: 2 @@ -63,16 +63,16 @@ Average number of iteration of the linear solver per call: 2 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
----------------------------------------------------------------------------------------------------------- -Libraries: | 0.117025 | 23.7 | 2 | -Kernels: | 0.371188 | 75.0 | 908 | -Copy host to device: | 0.000477292 | 0.1 | 18 | 7.5 GB/s -Copy device to host: | 0.00108064 | 0.2 | 7 | 9.8 GB/s -Alloc/Free on device: | 0.000145335 | 0.0 | 0 | -GPU: 99% Copy H<->D: 0.31% Alloc/free: 0.029% Comm: 0% CPU & I/O: 0.95% +Libraries: | 0.117615 | 27.5 | 2 | +Kernels: | 0.303267 | 70.9 | 906 | +Copy host to device: | 0.000490062 | 0.1 | 18 | 7.3 GB/s +Copy device to host: | 0.00116166 | 0.3 | 7 | 9.1 GB/s +Alloc/Free on device: | 0.000149718 | 0.0 | 0 | +GPU: 98% Copy H<->D: 0.39% Alloc/free: 0.035% Comm: 0% CPU & I/O: 1.2% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.0710111 +Time of the post-resolution: 0.0719171 -Total time for the whole computation 57.7447 +Total time for the whole computation 43.0153 diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is159479_cc120 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..c73f9b79df --- /dev/null +++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is159479_cc120 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the cuDSS_cholesky_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:38:43 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 2332800 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 39.9352 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 8.43065 +Average number of iteration of the linear solver per call: 2 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.75322 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.194802 +Standard deviation between time steps: 0.00396668 +Time elapsed in the 
skipped time steps: 0.821774 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.101463 | 52.1 | 2 +Convection operator | 0.01183454 | 6.1 | 4 +Diffusion operator | 0.0286888 | 14.7 | 26 +Gradient operator | 0.006335487 | 3.3 | 4 +Divergence operator | 0.001810582 | 0.9 | 3 +Source terms | 0.006615974 | 3.4 | 2 +Update ::mettre_a_jour | 0.002287811 | 1.2 | 1 +Solver for implicit diffusion | 0.01991212 | 10.2 | 4 +Computation of the time step dt | 0.004273607 | 2.2 | 8 +Turbulence model::update | 0.001026106 | 0.5 | 1 +Post-treatment operations | 0.003897243 | 2.0 | 1 +Other operations | 0.006656335 | 3.4 | + +Average number of iteration of the linear solver per call: 2 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.101454 | 52.1 | 2 | +Kernels: | 0.0885208 | 45.4 | 906 | +Copy host to device: | 0.000343567 | 0.2 | 18 | 10.4 GB/s +Copy device to host: | 0.00150662 | 0.8 | 7 | 7.0 GB/s +Alloc/Free on device: | 4.49228e-05 | 0.0 | 0 | +GPU: 98% Copy H<->D: 0.95% Alloc/free: 0.023% Comm: 0% CPU & I/O: 1.5% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0577461 + +Total time for the whole computation 42.5679 + diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.jean-zay_cc90 
b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.jean-zay_cc90 index 2356b4437a..5aaf2bc0e6 100644 --- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.jean-zay_cc90 +++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.jean-zay_cc90 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 14:10:34 -OS: jzxh080__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025 +Date: 10-06-2026 -- 14:11:55 +OS: jzxh126__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026 CPU model : Intel(R) Xeon(R) Platinum 8468 Total number of threads:192 GPU model: NVIDIA H100 80GB HBM3 CUDA runtime version: 12.60 -CUDA drivers version: 13.0 +CUDA drivers version: 13.20 Nb procs used for the computation: 1 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 2332800 @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 51.2627 +Total time of the start-up: 42.7384 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 11.2967 +Average time of the resolution of the linear problem per call: 11.1825 Average number of iteration of the linear solver per call: 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 1.56552 +Total time of the time loop: 1.39083 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.173947 -Standard deviation between time steps: 0.00701508 -Time elapsed in the skipped time steps: 1.0813 +Average time per time step: 0.154536 +Standard deviation between time steps: 0.00788354 +Time elapsed in the skipped time steps: 0.991765 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0661068 | 38.0 | 2 -Convection operator | 0.01267389 | 7.3 | 4 -Diffusion operator | 0.03157946 | 18.2 | 26 -Gradient operator | 0.01002847 | 5.8 | 4 -Divergence operator | 0.004405446 | 2.5 | 3 -Source terms | 0.006924221 | 4.0 | 2 -Update ::mettre_a_jour | 0.00359039 | 2.1 | 1 -Solver for implicit diffusion | 0.01661915 | 9.6 | 4 -Computation of the time step dt | 0.005447637 | 3.1 | 8 -Turbulence model::update | 0.001024498 | 0.6 | 1 -Post-treatment operations | 0.009382817 | 5.4 | 1 -Other operations | 0.006164344 | 3.5 | +Linear solver resolutions Ax=B | 0.0572004 | 37.0 | 2 +Convection operator | 0.009876482 | 6.4 | 4 +Diffusion operator | 0.02682544 | 17.4 | 26 +Gradient operator | 0.008371504 | 5.4 | 4 +Divergence operator | 0.001878353 | 1.2 | 3 +Source terms | 0.006324317 | 4.1 | 2 +Update ::mettre_a_jour | 0.002631955 | 1.7 | 1 +Solver for implicit diffusion | 0.01911372 | 12.4 | 4 +Computation of the time step dt | 0.004478554 | 2.9 | 8 +Turbulence model::update | 0.000924802 | 0.6 | 1 +Post-treatment operations | 0.009516202 | 6.2 | 1 +Other operations | 0.007394542 | 4.8 | Average number of iteration of the linear solver per call: 2 @@ -63,17 +63,17 @@ Average number 
of iteration of the linear solver per call: 2 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0660849 | 38.0 | 2 | -Kernels: | 0.0954356 | 54.9 | 908 | -Copy host to device: | 0.000601093 | 0.3 | 18 | 5.9 GB/s -Copy device to host: | 0.00153109 | 0.9 | 7 | 6.9 GB/s -Alloc/Free on device: | 7.77541e-05 | 0.0 | 0 | -GPU: 93% Copy H<->D: 1.2% Alloc/free: 0.045% Comm: 0% CPU & I/O: 5.9% +Libraries: | 0.0571762 | 37.0 | 2 | +Kernels: | 0.0847694 | 54.9 | 906 | +Copy host to device: | 0.000616722 | 0.4 | 18 | 5.8 GB/s +Copy device to host: | 0.00155482 | 1.0 | 7 | 6.8 GB/s +Alloc/Free on device: | 7.43548e-05 | 0.0 | 0 | +GPU: 92% Copy H<->D: 1.4% Alloc/free: 0.048% Comm: 0% CPU & I/O: 6.7% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.111628 +Time of the post-resolution: 0.125482 -Total time for the whole computation 54.0211 +Total time for the whole computation 45.2465 -[Slurm] Power consumption (63 s): 0.452 kW 0.008 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (59 s): 0.438 kW 0.007 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.topaze_cc80 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.topaze_cc80 index 6cb49dcfcb..181c563808 100644 --- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.topaze_cc80 +++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.topaze_cc80 @@ -8,8 +8,8 @@ Time is given in seconds 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 17:27:08 -OS: topaze7005__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 13-06-2026 -- 17:17:22 +OS: topaze7048__Linux__x86_64__4.18.0-553.123.1.el8_10.x86_64__#1 SMP Mon May 4 13:45:48 EDT 2026 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 62.3042 +Total time of the start-up: 46.3126 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 12.4547 +Average time of the resolution of the linear problem per call: 12.2661 Average number of iteration of the linear solver per call: 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 2.45396 +Total time of the time loop: 1.98361 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.272662 -Standard deviation between time steps: 0.00993732 -Time elapsed in the skipped time steps: 1.29729 +Average time per time step: 0.220401 +Standard deviation between time steps: 
0.00668609 +Time elapsed in the skipped time steps: 1.13073 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0858845 | 20.6 | 2 -Convection operator | 0.02602534 | 6.2 | 4 -Diffusion operator | 0.06160948 | 14.8 | 26 -Gradient operator | 0.01633848 | 3.9 | 4 -Divergence operator | 0.005998426 | 1.4 | 3 -Source terms | 0.01078654 | 2.6 | 2 -Update ::mettre_a_jour | 0.004897438 | 1.2 | 1 -Solver for implicit diffusion | 0.029447 | 7.1 | 4 -Computation of the time step dt | 0.01235725 | 3.0 | 8 -Turbulence model::update | 0.001674552 | 0.4 | 1 -Post-treatment operations | 0.008454185 | 2.0 | 1 -Other operations | 0.009188697 | 2.2 | +Linear solver resolutions Ax=B | 0.0839011 | 38.1 | 2 +Convection operator | 0.01539531 | 7.0 | 4 +Diffusion operator | 0.04144135 | 18.8 | 26 +Gradient operator | 0.01224873 | 5.6 | 4 +Divergence operator | 0.002501917 | 1.1 | 3 +Source terms | 0.00925932 | 4.2 | 2 +Update ::mettre_a_jour | 0.003451533 | 1.6 | 1 +Solver for implicit diffusion | 0.02763385 | 12.5 | 4 +Computation of the time step dt | 0.007391328 | 3.4 | 8 +Turbulence model::update | 0.001449264 | 0.7 | 1 +Post-treatment operations | 0.006868499 | 3.1 | 1 +Other operations | 0.008858503 | 4.0 | Average number of iteration of the linear solver per call: 2 @@ -63,16 +63,17 @@ Average number of iteration of the linear solver per call: 2 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0858593 | 31.5 | 2 | -Kernels: | 0.176086 | 64.6 | 908 | -Copy host to device: | 0.000506159 | 0.2 | 18 | 7.1 GB/s -Copy device to host: | 0.000966553 | 0.4 | 7 | 10.9 GB/s -Alloc/Free on 
device: | 9.52723e-05 | 0.0 | 0 | -GPU: 96% Copy H<->D: 0.54% Alloc/free: 0.035% Comm: 0% CPU & I/O: 3.4% +Libraries: | 0.0838766 | 38.1 | 2 | +Kernels: | 0.127721 | 57.9 | 906 | +Copy host to device: | 0.000500837 | 0.2 | 18 | 7.1 GB/s +Copy device to host: | 0.000939364 | 0.4 | 7 | 11.2 GB/s +Alloc/Free on device: | 9.52792e-05 | 0.0 | 0 | +GPU: 96% Copy H<->D: 0.65% Alloc/free: 0.043% Comm: 0% CPU & I/O: 3.3% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.203789 +Time of the post-resolution: 0.188939 -Total time for the whole computation 66.2593 +Total time for the whole computation 49.6159 +[Slurm] Power consumption (79 s): 0.765 kW 0.017 kWh 0.002 € (0.10€/kWh) diff --git a/tests/GPU/thermohydraulique_VDF_DNS/check_perf.sh b/tests/GPU/thermohydraulique_VDF_DNS/check_perf.sh new file mode 120000 index 0000000000..6d20411c12 --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/check_perf.sh @@ -0,0 +1 @@ +../DomainFlowLES/check_perf.sh \ No newline at end of file diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS.data b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS.data new file mode 100644 index 0000000000..c0d2a26856 --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS.data @@ -0,0 +1,584 @@ +# ThermoHydraulique 3D : bali metal VDF - Boussinesq # +# PARALLEL OK # +dimension 3 +Pb_Thermohydraulique pb + +Domaine dom + +# BEGIN MESH # +Mailler dom +{ + Pave Entree + { + /* warning dumb geometry */ + Origine 0. 0. 0. + Nombre_de_Noeuds 11 11 11 + /* Nombre_de_Noeuds 101 101 101 */ + Longueurs 1 1 1 + } + { + Bord gauche X = 0. 0. <= Y <= 1. 0. <= Z <= 1. + Bord droite X = 1. 0. <= Y <= 1. 0. <= Z <= 1. + Bord haut Y = 0. 0. <= X <= 1. 0. 
<= Z <= 1. + Bord bas Y = 1. 0. <= X <= 1. 0. <= Z <= 1. + Bord devant Z = 0. 0. <= X <= 1. 0. <= Y <= 1. + Bord derriere Z = 1. 0. <= X <= 1. 0. <= Y <= 1. + } +} +# END MESH # + +# BEGIN PARTITION +Partition dom +{ + Partition_tool Metis { Nb_parts 2 } + Larg_joint 2 + zones_name DOM +} +End +END PARTITION # + +# BEGIN SCATTER +Scatter DOM.Zones dom +END SCATTER # + +VDF dis +Lire dis { reorder { algo hilbert } } + +Scheme_euler_explicit sch_ex +Read sch_ex +{ +nb_pas_dt_max 10 + tinit 0. + tmax 3000. + dt_min 1.e-11 + dt_max 0.1 + dt_impr 5. + dt_sauv 5. + dt_start dt_calc + seuil_statio 1.e-8 + facsec 1 + diffusion_implicite 1 + tcpumax 23.5 /* Le calcul s'arretera proprement apr�s 23h30 */ +} + + + +Associate pb dom +Associate pb sch_ex + +Discretize pb dis + +# extraction des surfaces, conditions limites pour le post-traitement # +Domaine haut +Extraire_surface { domaine haut probleme pb avec_certains_bords 1 haut } +Domaine bas +Extraire_surface { domaine bas probleme pb avec_certains_bords 1 bas } +Domaine gauche +Extraire_surface { domaine gauche probleme pb avec_certains_bords 1 gauche } +Domaine droite +Extraire_surface { domaine droite probleme pb avec_certains_bords 1 droite } +Domaine milieu +Extraire_surface { domaine milieu probleme pb condition_elements 0.09D: 1.7% Alloc/free: 0.023% Comm: 0% CPU & I/O: 85% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0747186 + +Total time for the whole computation 29.7344 + +[Slurm] Power consumption (39 s): 0.426 kW 0.005 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.adastra_gfx942 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.adastra_gfx942 new file mode 100644 index 
0000000000..2195d121df --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.adastra_gfx942 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 08-06-2026 -- 14:56:53 +OS: a1002__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +CPU model : AMD Instinct MI300A Accelerator +Total number of threads:192 +GPU model: AMD Instinct MI300A +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 18.456 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.749276 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the 
computation of the time loop statistics +Total time of the time loop: 2.8676 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.318622 +Standard deviation between time steps: 0.744298 +Time elapsed in the skipped time steps: 0.180249 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0146931 | 4.6 | 1 +Convection operator | 0.001107729 | 0.3 | 2 +Diffusion operator | 0.01717939 | 5.4 | 9 +Gradient operator | 0.009569378 | 3.0 | 2 +Divergence operator | 0.0002454842 | 0.1 | 2 +Source terms | 0.0002103628 | 0.1 | 1 +Update ::mettre_a_jour | 0.0004310533 | 0.1 | 1 +Solver for implicit diffusion | 0.002886217 | 0.9 | 2 +Computation of the time step dt | 0.0006566232 | 0.2 | 6 +Post-treatment operations | 0.2791213 | 87.6 | 1 +Other operations | -0.007478771 | -2.3 | + +Average number of iteration of the linear solver per call: 20 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0145741 | 4.6 | 1 | +Kernels: | 0.0155426 | 4.9 | 451 | +Copy host to device: | 0.001705 | 0.5 | 38 | 26.6 GB/s +Copy device to host: | 0.00263336 | 0.8 | 31 | 33.3 GB/s +Alloc/Free on device: | 0.000196487 | 0.1 | 386 | +GPU: 9.5% Copy H<->D: 1.4% Alloc/free: 0.062% Comm: 0% CPU & I/O: 89% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.073254 + +Total time for the whole computation 21.5772 + +[Slurm] Power consumption (29 s): 0.606 kW 0.005 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.dalianvl_cc100 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..5d0c4c1dc9 --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:19:59 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 10.3147 + +Number of calls to the 
linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.47066 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.46847 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.274274 +Standard deviation between time steps: 0.653925 +Time elapsed in the skipped time steps: 0.53959 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0130017 | 4.7 | 1 +Convection operator | 0.0007011693 | 0.3 | 2 +Diffusion operator | 0.01002715 | 3.7 | 9 +Gradient operator | 0.004981093 | 1.8 | 2 +Divergence operator | 0.0002038184 | 0.1 | 2 +Source terms | 0.0001672706 | 0.1 | 1 +Update ::mettre_a_jour | 0.000417205 | 0.2 | 1 +Solver for implicit diffusion | 0.001707723 | 0.6 | 2 +Computation of the time step dt | 0.0004988829 | 0.2 | 6 +Post-treatment operations | 0.2453269 | 89.4 | 1 +Other operations | -0.002758624 | -1.0 | + +Average number of iteration of the linear solver per call: 20 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0127201 | 4.6 | 1 | +Kernels: | 0.00998931 | 3.6 | 451 | +Copy 
host to device: | 0.00101425 | 0.4 | 38 | 44.8 GB/s +Copy device to host: | 0.0014345 | 0.5 | 31 | 61.2 GB/s +Alloc/Free on device: | 0.000276302 | 0.1 | 386 | +GPU: 8.3% Copy H<->D: 0.89% Alloc/free: 0.1% Comm: 0% CPU & I/O: 91% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0525768 + +Total time for the whole computation 13.3754 + diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.eureka_cc89 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..1235b5aa36 --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.eureka_cc89 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:38:31 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 12.0825 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.808736 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.32686 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.25854 +Standard deviation between time steps: 0.526476 +Time elapsed in the skipped time steps: 0.237277 
+ + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0296802 | 11.5 | 1 +Convection operator | 0.002672108 | 1.0 | 2 +Diffusion operator | 0.02081252 | 8.1 | 9 +Gradient operator | 0.006097233 | 2.4 | 2 +Divergence operator | 0.003118208 | 1.2 | 2 +Source terms | 0.0002127497 | 0.1 | 1 +Update ::mettre_a_jour | 0.001786451 | 0.7 | 1 +Solver for implicit diffusion | 0.001692247 | 0.7 | 2 +Computation of the time step dt | 0.0007426396 | 0.3 | 6 +Post-treatment operations | 0.197774 | 76.5 | 1 +Other operations | -0.006048129 | -2.3 | + +Average number of iteration of the linear solver per call: 29.3 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0295874 | 11.4 | 1 | +Kernels: | 0.01683 | 6.5 | 430 | +Copy host to device: | 0.00693595 | 2.7 | 29 | 9.9 GB/s +Copy device to host: | 0.00794341 | 3.1 | 23 | 14.1 GB/s +Alloc/Free on device: | 9.93497e-05 | 0.0 | 385 | +GPU: 18% Copy H<->D: 5.8% Alloc/free: 0.038% Comm: 0% CPU & I/O: 76% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0105536 + +Total time for the whole computation 14.6572 + diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.irene-amd-ccrt_cc70 
b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.irene-amd-ccrt_cc70 new file mode 100644 index 0000000000..ce8b04d16c --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.irene-amd-ccrt_cc70 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 23-04-2026 -- 09:48:39 +OS: irene7050__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz +Total number of threads:80 +GPU model: Tesla V100-SXM2-16GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 15.0231 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.87689 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 5.3121 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.590233 +Standard deviation between time steps: 1.32085 +Time elapsed in the skipped time steps: 0.448604 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0376823 | 6.4 | 1 +Convection operator | 0.001876149 | 0.3 | 2 +Diffusion operator | 0.04526071 | 7.7 | 9 +Gradient operator | 0.007839558 | 1.3 | 2 +Divergence operator | 0.0006154726 | 0.1 | 2 +Source terms | 0.00030727 | 0.1 | 1 +Update ::mettre_a_jour | 0.0006910987 | 0.1 | 1 +Solver for implicit diffusion | 0.003822609 | 0.6 | 2 +Computation of the time step dt | 0.001210896 | 0.2 | 6 +Post-treatment operations | 0.4956301 | 84.0 | 1 +Other operations | -0.004703008 | -0.8 | + +Average number of iteration of the linear solver per call: 24.9 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0375116 | 6.4 | 1 | +Kernels: | 0.0249549 | 4.2 | 432 | +Copy host to device: | 0.0121441 | 2.1 | 27 | 4.2 GB/s +Copy device to host: | 0.025129 | 4.3 | 23 | 4.6 GB/s +Alloc/Free on device: | 0.000107331 | 0.0 | 384 | +GPU: 11% Copy H<->D: 6.3% Alloc/free: 0.018% Comm: 0% CPU & I/O: 83% 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.074717 + +Total time for the whole computation 20.8585 + +[Slurm] Power consumption (36 s): 0.246 kW 0.002 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is157091_cc86 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is157091_cc86 new file mode 100644 index 0000000000..e68443d67e --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is157091_cc86 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 14-05-2026 -- 16:27:50 +OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 +CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores +Total number of threads:64 +GPU model: NVIDIA RTX A6000 +CUDA runtime version: 12.90 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 10.9568 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.567296 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 3.25948 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.362164 +Standard deviation between time steps: 0.767551 +Time elapsed in the skipped time steps: 0.197513 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0344571 | 9.5 | 1 +Convection operator | 0.002472719 | 0.7 | 2 +Diffusion operator | 0.02741157 | 7.6 | 9 +Gradient operator | 0.007760346 | 2.1 | 2 +Divergence operator | 0.0006208222 | 0.2 | 2 +Source terms | 0.0003084709 | 0.1 | 1 +Update ::mettre_a_jour | 0.0006127068 | 0.2 | 1 +Solver for implicit diffusion | 0.003998419 | 1.1 | 2 +Computation of the time step dt | 0.001362192 | 0.4 | 6 +Post-treatment operations | 0.2878818 | 79.5 | 1 +Other operations | -0.004721987 | -1.3 | + +Average number of iteration of the linear solver per call: 20 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
+----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0342252 | 9.5 | 1 | +Kernels: | 0.0249121 | 6.9 | 451 | +Copy host to device: | 0.00424511 | 1.2 | 38 | 10.7 GB/s +Copy device to host: | 0.00826155 | 2.3 | 31 | 10.6 GB/s +Alloc/Free on device: | 0.000147109 | 0.0 | 386 | +GPU: 16% Copy H<->D: 3.5% Alloc/free: 0.041% Comm: 0% CPU & I/O: 80% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0714031 + +Total time for the whole computation 14.4852 + diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is159479_cc120 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..9cde3f4150 --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is159479_cc120 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 13-05-2026 -- 07:07:50 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May 1 12:45:19 UTC 2026 (6 +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 6.99314 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.359449 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.67615 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.186239 +Standard deviation between time steps: 0.390992 +Time 
elapsed in the skipped time steps: 0.162419 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0150849 | 8.1 | 1 +Convection operator | 0.0008391611 | 0.5 | 2 +Diffusion operator | 0.02027865 | 10.9 | 9 +Gradient operator | 0.006390957 | 3.4 | 2 +Divergence operator | 0.000193898 | 0.1 | 2 +Source terms | 0.0001278147 | 0.1 | 1 +Update ::mettre_a_jour | 0.0002891871 | 0.2 | 1 +Solver for implicit diffusion | 0.001259632 | 0.7 | 2 +Computation of the time step dt | 0.0004311387 | 0.2 | 6 +Post-treatment operations | 0.1466799 | 78.8 | 1 +Other operations | -0.005336367 | -2.9 | + +Average number of iteration of the linear solver per call: 20 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0149707 | 8.0 | 1 | +Kernels: | 0.00987719 | 5.3 | 451 | +Copy host to device: | 0.00382741 | 2.1 | 38 | 11.9 GB/s +Copy device to host: | 0.00974978 | 5.2 | 31 | 9.0 GB/s +Alloc/Free on device: | 0.000131351 | 0.1 | 386 | +GPU: 13% Copy H<->D: 7.3% Alloc/free: 0.071% Comm: 0% CPU & I/O: 79% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0120666 + +Total time for the whole computation 8.84379 + diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is246827_cc86 
b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is246827_cc86 new file mode 100644 index 0000000000..93bebc81c5 --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is246827_cc86 @@ -0,0 +1,52 @@ +Statistiques d'initialisation du calcul + +Temps total 10.2829 + +Statistiques de resolution du probleme + +Temps total 6.3659 + + +Timesteps 10 +Secondes / pas de temps 0.636587 +Dont solveurs Ax=B 0.158250 24% (1 appel/pas de temps) +Dont solveur diffusion_implicite 0.042880 6% (2 appels/pas de temps) +Dont mettre_a_jour 0.083516 13% (1 appel/pas de temps) +Dont operateurs convection 0.016030 2% (2 appels/pas de temps) +Dont operateurs diffusion 0.062878 9% (8.9 appels/pas de temps) +Dont operateurs gradient 0.010380 1% (2.2 appels/pas de temps) +Dont operateurs divergence 0.004090 0% (2 appels/pas de temps) +Dont operateurs source 0.001426 0% (1 appel/pas de temps) +Dont operations postraitement 0.241783 37% (1 appel/pas de temps) +Dont calcul dt 0.003010 0% (6 appels/pas de temps) +Dont calcul divers 0.012344 1% (0 appels/pas de temps) +Nb solveur / pas de temps 1 +Secondes / solveur 0.15825 +Iterations / solveur 18 +GPU statistics per time step (experimental): +Libraries : 0.157395 s 24.7% 1.0 calls +Kernels : 0.105369 s 16.6% 406.9 calls +Copy H2D : 0.023752 s 3.7% 51.2 calls 9.2 GB/s +Copy D2H : 0.021397 s 3.4% 39.6 calls 10.0 GB/s +Alloc/Free: 0.005759 s 0.9% 403.0 calls +GPU: 41.2% Copy H<->D: 7% Alloc/Free: 0.9% Comm: 0% CPU & Others: 50.7% +I/O: + +Timesteps = number of time steps +Nb solveur = number of linear system resolutions +Nb assemblage implicite = number of matrix assemblies for the implicit scheme +Iterations = average number of iterations of the solver +Communications = fraction of the time spent + in communications between processors (excluding io files) +Network latency = time of one mpsum measured by an internal bench over 0.1s +Network bandwidth = maximum on all processors + of the 
average bandwidth of send_recv operations +Waiting time = estimation of the waiting time of the different processors + +Max_waiting_time big => probably due to a bad partitioning +Communications > 30% => too many processors or network too slow + +Statistiques de post resolution + +Temps total 0.014188 + diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is247793_gfx1100 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is247793_gfx1100 new file mode 100644 index 0000000000..0a9068ceab --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is247793_gfx1100 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 19:07:04 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total 
time of the start-up: 9.83297 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.987836 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 2.41001 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.267778 +Standard deviation between time steps: 0.514588 +Time elapsed in the skipped time steps: 0.258782 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0385379 | 14.4 | 1 +Convection operator | 0.00245368 | 0.9 | 2 +Diffusion operator | 0.02310321 | 8.6 | 9 +Gradient operator | 0.006615284 | 2.5 | 2 +Divergence operator | 0.0006505396 | 0.2 | 2 +Source terms | 0.0003957816 | 0.1 | 1 +Update ::mettre_a_jour | 0.0007646496 | 0.3 | 1 +Solver for implicit diffusion | 0.004011812 | 1.5 | 2 +Computation of the time step dt | 0.001353264 | 0.5 | 6 +Post-treatment operations | 0.193474 | 72.3 | 1 +Other operations | -0.003581728 | -1.3 | + +Average number of iteration of the linear solver per call: 20 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0383825 
| 14.3 | 1 | +Kernels: | 0.0315465 | 11.8 | 451 | +Copy host to device: | 0.00322612 | 1.2 | 38 | 14.1 GB/s +Copy device to host: | 0.0041323 | 1.5 | 31 | 21.2 GB/s +Alloc/Free on device: | 0.00016324 | 0.1 | 386 | +GPU: 26% Copy H<->D: 2.7% Alloc/free: 0.061% Comm: 0% CPU & I/O: 71% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0369687 + +Total time for the whole computation 12.5387 + diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.jean-zay_cc90 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.jean-zay_cc90 new file mode 100644 index 0000000000..555ca2462e --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.jean-zay_cc90 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 10-06-2026 -- 10:48:07 +OS: jzxh021__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026 +CPU model : Intel(R) Xeon(R) Platinum 8468 +Total number of threads:192 +GPU model: NVIDIA H100 80GB HBM3 +CUDA runtime version: 12.60 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 16.2378 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.573866 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 3.01618 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.335131 +Standard deviation between time steps: 0.769179 +Time elapsed in the skipped time steps: 0.269408 + + 
+Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0156995 | 4.7 | 1 +Convection operator | 0.0008767052 | 0.3 | 2 +Diffusion operator | 0.02406568 | 7.2 | 9 +Gradient operator | 0.009106745 | 2.7 | 2 +Divergence operator | 0.0002421182 | 0.1 | 2 +Source terms | 0.000181716 | 0.1 | 1 +Update ::mettre_a_jour | 0.0004172798 | 0.1 | 1 +Solver for implicit diffusion | 0.00191558 | 0.6 | 2 +Computation of the time step dt | 0.0005445146 | 0.2 | 6 +Post-treatment operations | 0.2885673 | 86.1 | 1 +Other operations | -0.006486277 | -1.9 | + +Average number of iteration of the linear solver per call: 20 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0154814 | 4.6 | 1 | +Kernels: | 0.0136429 | 4.1 | 451 | +Copy host to device: | 0.00519298 | 1.5 | 38 | 8.7 GB/s +Copy device to host: | 0.0112822 | 3.4 | 31 | 7.8 GB/s +Alloc/Free on device: | 0.000159493 | 0.0 | 386 | +GPU: 8.7% Copy H<->D: 4.9% Alloc/free: 0.048% Comm: 0% CPU & I/O: 86% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0593804 + +Total time for the whole computation 19.5828 + +[Slurm] Power consumption (28 s): 0.432 kW 0.003 kWh 0.000 € (0.10€/kWh) diff --git 
a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.lumi_gfx90a b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.lumi_gfx90a new file mode 100644 index 0000000000..28ea817dc8 --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.lumi_gfx90a @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 15-05-2026 -- 21:14:43 +OS: nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +CPU model : AMD EPYC 7A53 64-Core Processor +Total number of threads:128 +GPU model: AMD Instinct MI250X +HIP runtime version: 6.43 +HIP drivers version: 6.43 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 57.6288 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.5162 +Average number of iteration of the linear solver per call: 0 + 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 3.5764 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.397378 +Standard deviation between time steps: 0.910304 +Time elapsed in the skipped time steps: 0.2462 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0196565 | 4.9 | 1 +Convection operator | 0.002379149 | 0.6 | 2 +Diffusion operator | 0.025488 | 6.4 | 9 +Gradient operator | 0.007580062 | 1.9 | 2 +Divergence operator | 0.0005331556 | 0.1 | 2 +Source terms | 0.0003113889 | 0.1 | 1 +Update ::mettre_a_jour | 0.0006967458 | 0.2 | 1 +Solver for implicit diffusion | 0.003233921 | 0.8 | 2 +Computation of the time step dt | 0.001146262 | 0.3 | 6 +Post-treatment operations | 0.3413949 | 85.9 | 1 +Other operations | -0.005041789 | -1.3 | + +Average number of iteration of the linear solver per call: 20 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0195123 | 4.9 | 1 | +Kernels: | 0.0348402 | 8.8 | 451 | +Copy host to device: | 0.00272454 | 0.7 | 38 | 16.7 GB/s +Copy device to host: | 0.00472031 | 1.2 | 31 | 18.6 GB/s +Alloc/Free on device: | 9.4555e-05 | 0.0 | 386 | +GPU: 14% 
Copy H<->D: 1.9% Alloc/free: 0.024% Comm: 0% CPU & I/O: 84% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0660653 + +Total time for the whole computation 61.5175 + +[Slurm] Power consumption (82 s): 0.461 kW 0.010 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.topaze_cc80 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.topaze_cc80 new file mode 100644 index 0000000000..98731ede4d --- /dev/null +++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.topaze_cc80 @@ -0,0 +1,78 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 15-05-2026 -- 14:01:08 +OS: topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +CPU model : AMD EPYC 7763 64-Core Processor +Total number of threads:256 +GPU model: NVIDIA A100-SXM4-80GB +CUDA runtime version: 12.90 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1000000 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 
Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 15.3546 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.753724 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 3.64246 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.404717 +Standard deviation between time steps: 0.950988 +Time elapsed in the skipped time steps: 0.357684 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.019992 | 4.9 | 1 +Convection operator | 0.001205197 | 0.3 | 2 +Diffusion operator | 0.02073481 | 5.1 | 9 +Gradient operator | 0.007618354 | 1.9 | 2 +Divergence operator | 0.0003536566 | 0.1 | 2 +Source terms | 0.0002187154 | 0.1 | 1 +Update ::mettre_a_jour | 0.0004763972 | 0.1 | 1 +Solver for implicit diffusion | 0.002284928 | 0.6 | 2 +Computation of the time step dt | 0.0007091229 | 0.2 | 6 +Post-treatment operations | 0.3566838 | 88.1 | 1 +Other operations | -0.005559456 | -1.4 | + +Average number of iteration of the linear solver per call: 20 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time 
| Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.019737 | 4.9 | 1 | +Kernels: | 0.0149773 | 3.7 | 451 | +Copy host to device: | 0.00328589 | 0.8 | 38 | 13.8 GB/s +Copy device to host: | 0.0061287 | 1.5 | 31 | 14.3 GB/s +Alloc/Free on device: | 0.000240941 | 0.1 | 386 | +GPU: 8.6% Copy H<->D: 2.3% Alloc/free: 0.06% Comm: 0% CPU & I/O: 89% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.0820633 + +Total time for the whole computation 19.4368 + +[Slurm] Power consumption (62 s): 0.435 kW 0.007 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS.data b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS.data index 5143a22e67..901bfc3f8b 100644 --- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS.data +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS.data @@ -41,7 +41,8 @@ END PARTITION # Scatter DOM.Zones dom END SCATTER # -VEFPreP1B dis +VEFPreP1B dis +Lire dis { reorder { algo Hilbert } } runge_kutta_ordre_3 sch_ex Read sch_ex diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx90a b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx90a index abc6fb98e2..9a6ba42432 100644 --- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx90a +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx90a @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 18:18:45 -OS: g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 09:39:29 +OS: g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 36.3455 +Total time of the start-up: 38.5416 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.817213 +Average time of the resolution of the linear problem per call: 1.01807 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 8.34709 +Total time of the time loop: 7.03316 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.927454 -Standard deviation between time steps: 0.186576 -Time elapsed in the skipped time steps: 1.10284 +Average time per time step: 0.781463 +Standard deviation between time steps: 0.174335 +Time elapsed in the skipped time steps: 0.919797 Standard counter description | Time/step | % loop time | Call(s)/step 
------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0353757 | 3.4 | 3 -Convection operator | 0.2771735 | 26.4 | 6 -Diffusion operator | 0.1017226 | 9.7 | 24 -Gradient operator | 0.08935723 | 8.5 | 6 -Divergence operator | 0.01984002 | 1.9 | 4 -Source terms | 0.1596252 | 15.2 | 6 -Update ::mettre_a_jour | 0.02635399 | 2.5 | 1 -Solver for implicit diffusion | 0.02084072 | 2.0 | 6 -Computation of the time step dt | 0.003133064 | 0.3 | 10 -Post-treatment operations | 0.1788569 | 17.0 | 1 -Other operations | 0.01517503 | 1.4 | +Linear solver resolutions Ax=B | 0.0294224 | 3.8 | 3 +Convection operator | 0.2453152 | 31.4 | 6 +Diffusion operator | 0.08785235 | 11.2 | 24 +Gradient operator | 0.04586587 | 5.9 | 6 +Divergence operator | 0.01216054 | 1.6 | 4 +Source terms | 0.1386433 | 17.7 | 6 +Update ::mettre_a_jour | 0.02083962 | 2.7 | 1 +Solver for implicit diffusion | 0.01898591 | 2.4 | 6 +Computation of the time step dt | 0.00293547 | 0.4 | 10 +Post-treatment operations | 0.1645779 | 21.1 | 1 +Other operations | 0.01486426 | 1.9 | Average number of iteration of the linear solver per call: 8 @@ -62,17 +62,17 @@ Average number of iteration of the linear solver per call: 8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0348663 | 3.8 | 3 | -Kernels: | 0.828195 | 89.3 | 1807 | -Copy host to device: | 0.000956014 | 0.1 | 43 | 3.1 GB/s -Copy device to host: | 0.00563713 | 0.6 | 98 | 11.0 GB/s -Alloc/Free on device: | 0.000106537 | 0.0 | 4 | -GPU: 93% Copy H<->D: 0.71% Alloc/free: 0.011% Comm: 0% CPU & I/O: 6.2% +Libraries: | 0.0288868 | 3.7 | 3 | +Kernels: | 0.688197 | 88.1 | 1804 | +Copy host to device: | 0.000964265 | 0.1 | 43 | 3.0 GB/s +Copy 
device to host: | 0.0056269 | 0.7 | 98 | 11.0 GB/s +Alloc/Free on device: | 0.000106762 | 0.0 | 4 | +GPU: 92% Copy H<->D: 0.84% Alloc/free: 0.014% Comm: 0% CPU & I/O: 7.4% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 1.18684 +Time of the post-resolution: 1.18641 -Total time for the whole computation 46.9823 +Total time for the whole computation 47.681 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (55 s): 0.473 kW 0.007 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx942 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx942 index d45c73d182..8c6d846b16 100644 --- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx942 +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx942 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 17-03-2026 -- 18:10:43 -OS: a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 +Date: 23-04-2026 -- 15:01:05 +OS: a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025 CPU model : AMD Instinct MI300A Accelerator Total number of threads:192 GPU model: AMD Instinct MI300A @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 55.4268 +Total time of the start-up: 45.1512 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.14745 +Average time of the resolution of the linear problem per call: 1.25045 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 2.77306 +Total time of the time loop: 2.72452 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.308118 -Standard deviation between time steps: 0.141243 -Time elapsed in the skipped time steps: 0.601196 +Average time per time step: 0.302725 +Standard deviation between time steps: 0.144304 +Time elapsed in the skipped time steps: 0.608773 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0180691 | 5.9 | 3 -Convection operator | 0.05506105 | 17.9 | 6 -Diffusion operator | 0.02738197 | 8.9 | 24 -Gradient operator | 0.01970771 | 6.4 | 6 -Divergence operator | 0.005881149 | 1.9 | 4 -Source terms | 0.0142448 | 4.6 | 6 -Update ::mettre_a_jour | 0.01042488 | 3.4 | 1 -Solver for implicit diffusion | 0.01383581 | 4.5 | 6 -Computation of the time step dt | 0.002407566 | 0.8 | 10 -Post-treatment operations | 0.1322655 | 42.9 | 1 -Other operations | 
0.008838375 | 2.9 | +Linear solver resolutions Ax=B | 0.016609 | 5.5 | 3 +Convection operator | 0.05170634 | 17.1 | 6 +Diffusion operator | 0.027014 | 8.9 | 24 +Gradient operator | 0.0191032 | 6.3 | 6 +Divergence operator | 0.004540961 | 1.5 | 4 +Source terms | 0.01447686 | 4.8 | 6 +Update ::mettre_a_jour | 0.009973257 | 3.3 | 1 +Solver for implicit diffusion | 0.01398563 | 4.6 | 6 +Computation of the time step dt | 0.002454303 | 0.8 | 10 +Post-treatment operations | 0.1340098 | 44.3 | 1 +Other operations | 0.008851631 | 2.9 | Average number of iteration of the linear solver per call: 8 @@ -62,17 +62,17 @@ Average number of iteration of the linear solver per call: 8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0175936 | 5.7 | 3 | -Kernels: | 0.23352 | 75.8 | 1807 | -Copy host to device: | 0.000902083 | 0.3 | 43 | 3.2 GB/s -Copy device to host: | 0.00451282 | 1.5 | 98 | 13.7 GB/s -Alloc/Free on device: | 4.65706e-05 | 0.0 | 4 | -GPU: 81% Copy H<->D: 1.8% Alloc/free: 0.015% Comm: 0% CPU & I/O: 17% +Libraries: | 0.0162141 | 5.4 | 3 | +Kernels: | 0.227653 | 75.2 | 1804 | +Copy host to device: | 0.000941173 | 0.3 | 43 | 3.1 GB/s +Copy device to host: | 0.00467396 | 1.5 | 98 | 13.3 GB/s +Alloc/Free on device: | 4.81214e-05 | 0.0 | 4 | +GPU: 81% Copy H<->D: 1.9% Alloc/free: 0.016% Comm: 0% CPU & I/O: 18% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 1.11388 +Time of the post-resolution: 1.02665 -Total time for the whole computation 59.915 +Total time for the whole 
computation 49.5112 -[Slurm] Power consumption (69 s): 0.680 kW 0.013 kWh 0.001 € (0.10€/kWh) +[Slurm] Power consumption (59 s): 0.654 kW 0.011 kWh 0.001 € (0.10€/kWh) diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.dalianvl_cc100 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.dalianvl_cc100 new file mode 100644 index 0000000000..15616e173d --- /dev/null +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.dalianvl_cc100 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VEF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 12:20:32 +OS: dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025 +CPU model : Unknown Linux CPU +Total number of threads:144 +GPU model: NVIDIA GB200 +CUDA runtime version: 13.0 +CUDA drivers version: 13.20 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1638400 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 24.612 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.896434 +Average 
number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.50874 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.167637 +Standard deviation between time steps: 0.112427 +Time elapsed in the skipped time steps: 0.291127 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.016792 | 10.0 | 3 +Convection operator | 0.01345494 | 8.0 | 6 +Diffusion operator | 0.0119605 | 7.1 | 24 +Gradient operator | 0.005281847 | 3.2 | 6 +Divergence operator | 0.00148673 | 0.9 | 4 +Source terms | 0.0224019 | 13.4 | 6 +Update ::mettre_a_jour | 0.005473263 | 3.3 | 1 +Solver for implicit diffusion | 0.007801915 | 4.7 | 6 +Computation of the time step dt | 0.001045642 | 0.6 | 10 +Post-treatment operations | 0.07443955 | 44.4 | 1 +Other operations | 0.007499072 | 4.5 | + +Average number of iteration of the linear solver per call: 8 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0164142 | 9.8 | 3 | +Kernels: | 0.0958071 | 57.2 | 1632 | +Copy host to device: | 0.000911807 | 0.5 | 51 | 3.2 GB/s +Copy device to host: | 0.00192771 | 1.1 | 40 | 32.1 GB/s 
+Alloc/Free on device: | 0.00105571 | 0.6 | 23 | +GPU: 67% Copy H<->D: 1.7% Alloc/free: 0.63% Comm: 0% CPU & I/O: 31% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.793499 + +Total time for the whole computation 27.2054 + diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.eureka_cc89 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.eureka_cc89 new file mode 100644 index 0000000000..d1437d5c09 --- /dev/null +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.eureka_cc89 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VEF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 11-05-2026 -- 08:39:12 +OS: eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2 +CPU model : INTEL(R) XEON(R) PLATINUM 8580 +Total number of threads:240 +GPU model: NVIDIA RTX 6000 Ada Generation +CUDA runtime version: 12.90 +CUDA drivers version: 13.10 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1638400 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 
Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 36.5273 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.08866 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 6.4139 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.712655 +Standard deviation between time steps: 0.291946 +Time elapsed in the skipped time steps: 1.19671 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0288126 | 4.0 | 3 +Convection operator | 0.04361792 | 6.1 | 6 +Diffusion operator | 0.02553571 | 3.6 | 24 +Gradient operator | 0.01168674 | 1.6 | 6 +Divergence operator | 0.003601375 | 0.5 | 4 +Source terms | 0.02653658 | 3.7 | 6 +Update ::mettre_a_jour | 0.01541138 | 2.2 | 1 +Solver for implicit diffusion | 0.01887595 | 2.6 | 6 +Computation of the time step dt | 0.003067977 | 0.4 | 10 +Post-treatment operations | 0.5206003 | 73.1 | 1 +Other operations | 0.01490858 | 2.1 | + +Average number of iteration of the linear solver per call: 8 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | 
Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.028531 | 4.0 | 3 | +Kernels: | 0.620912 | 87.1 | 1745 | +Copy host to device: | 0.000704046 | 0.1 | 43 | 4.1 GB/s +Copy device to host: | 0.00607506 | 0.9 | 102 | 10.2 GB/s +Alloc/Free on device: | 0.000138622 | 0.0 | 4 | +GPU: 91% Copy H<->D: 0.95% Alloc/free: 0.019% Comm: 0% CPU & I/O: 7.9% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.3616 + +Total time for the whole computation 44.4995 + diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.irene-amd-ccrt_cc70 index 41f24b0a65..659cabc7f3 100644 --- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.irene-amd-ccrt_cc70 +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.irene-amd-ccrt_cc70 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-03-2026 -- 15:22:09 -OS: irene7067__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 23-04-2026 -- 15:17:50 +OS: irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz Total number of threads:80 GPU model: Tesla V100-SXM2-16GB @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 50.0545 +Total time of the start-up: 52.508 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 2.13913 +Average time of the resolution of the linear problem per call: 1.6874 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 5.19813 +Total time of the time loop: 4.33648 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.57757 -Standard deviation between time steps: 0.16654 -Time elapsed in the skipped time steps: 1.45793 +Average time per time step: 0.481832 +Standard deviation between time steps: 0.178332 +Time elapsed in the skipped time steps: 1.23974 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0491332 | 8.5 | 3 -Convection operator | 0.1156491 | 20.0 | 6 -Diffusion operator | 0.06374568 | 11.0 | 24 -Gradient operator | 0.03774611 | 6.5 | 6 -Divergence operator | 0.02187735 | 3.8 | 4 -Source terms | 0.05641414 | 9.8 | 6 -Update ::mettre_a_jour | 0.03031997 | 5.2 | 1 -Solver for implicit diffusion | 0.02578301 | 4.5 | 6 -Computation of the time step dt | 0.003343664 | 0.6 | 10 -Post-treatment operations | 0.1561775 | 27.0 | 1 -Other operations | 
0.01738065 | 3.0 | +Linear solver resolutions Ax=B | 0.0441547 | 9.2 | 3 +Convection operator | 0.08919729 | 18.5 | 6 +Diffusion operator | 0.03954753 | 8.2 | 24 +Gradient operator | 0.01919536 | 4.0 | 6 +Divergence operator | 0.0121496 | 2.5 | 4 +Source terms | 0.04608156 | 9.6 | 6 +Update ::mettre_a_jour | 0.02597848 | 5.4 | 1 +Solver for implicit diffusion | 0.02597088 | 5.4 | 6 +Computation of the time step dt | 0.003337014 | 0.7 | 10 +Post-treatment operations | 0.1580431 | 32.8 | 1 +Other operations | 0.01817609 | 3.8 | Average number of iteration of the linear solver per call: 8 @@ -62,17 +62,17 @@ Average number of iteration of the linear solver per call: 8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0484481 | 8.4 | 3 | -Kernels: | 0.457903 | 79.3 | 1807 | -Copy host to device: | 0.0013404 | 0.2 | 43 | 2.2 GB/s -Copy device to host: | 0.0153024 | 2.6 | 98 | 4.0 GB/s -Alloc/Free on device: | 0.000149768 | 0.0 | 4 | -GPU: 88% Copy H<->D: 2.9% Alloc/free: 0.026% Comm: 0% CPU & I/O: 9.4% +Libraries: | 0.0434439 | 9.0 | 3 | +Kernels: | 0.363077 | 75.4 | 1804 | +Copy host to device: | 0.00140484 | 0.3 | 43 | 2.1 GB/s +Copy device to host: | 0.0171422 | 3.6 | 98 | 3.6 GB/s +Alloc/Free on device: | 0.000169156 | 0.0 | 4 | +GPU: 84% Copy H<->D: 3.8% Alloc/free: 0.035% Comm: 0% CPU & I/O: 12% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 1.0736 +Time of the post-resolution: 0.982788 -Total time for the whole computation 57.7842 +Total time for the whole 
computation 59.0671 -[Slurm] Power consumption (82 s): 0.178 kW 0.004 kWh 0.000 € (0.10€/kWh) +[Slurm] Power consumption (73 s): 0.229 kW 0.005 kWh 0.000 € (0.10€/kWh) diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is157091_cc86 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is157091_cc86 index fee1e8c0b2..e656412694 100644 --- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is157091_cc86 +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is157091_cc86 @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 18:01:42 +Date: 07-05-2026 -- 14:32:18 OS: is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores Total number of threads:64 GPU model: NVIDIA RTX A6000 CUDA runtime version: 12.90 -CUDA drivers version: 12.70 +CUDA drivers version: 13.20 Nb procs used for the computation: 1 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 1638400 @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 29.542 +Total time of the start-up: 29.1674 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 0.859958 +Average time of the resolution of the linear problem per call: 0.982441 Average number of iteration of 
the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 4.91682 +Total time of the time loop: 4.07933 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.546314 -Standard deviation between time steps: 0.113084 -Time elapsed in the skipped time steps: 0.906668 +Average time per time step: 0.453259 +Standard deviation between time steps: 0.108612 +Time elapsed in the skipped time steps: 0.790546 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0500233 | 9.2 | 3 -Convection operator | 0.1068827 | 19.6 | 6 -Diffusion operator | 0.05186786 | 9.5 | 24 -Gradient operator | 0.03487695 | 6.4 | 6 -Divergence operator | 0.01704209 | 3.1 | 4 -Source terms | 0.05857455 | 10.7 | 6 -Update ::mettre_a_jour | 0.0282653 | 5.2 | 1 -Solver for implicit diffusion | 0.03199751 | 5.9 | 6 -Computation of the time step dt | 0.004401211 | 0.8 | 10 -Post-treatment operations | 0.141302 | 25.9 | 1 -Other operations | 0.02108005 | 3.9 | +Linear solver resolutions Ax=B | 0.0483038 | 10.7 | 3 +Convection operator | 0.08223569 | 18.1 | 6 +Diffusion operator | 0.04077174 | 9.0 | 24 +Gradient operator | 0.0149534 | 3.3 | 6 +Divergence operator | 0.01122735 | 2.5 | 4 +Source terms | 0.05184123 | 11.4 | 6 +Update ::mettre_a_jour | 0.02289992 | 5.1 | 1 +Solver for implicit diffusion | 0.03037772 | 6.7 | 6 +Computation of the time step dt | 0.004375326 | 1.0 | 10 +Post-treatment operations | 0.126597 | 27.9 | 1 +Other operations | 0.01967598 | 4.3 | Average number of iteration of the linear 
solver per call: 8 @@ -62,16 +62,16 @@ Average number of iteration of the linear solver per call: 8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0493887 | 9.0 | 3 | -Kernels: | 0.455456 | 83.4 | 1807 | -Copy host to device: | 0.000654241 | 0.1 | 43 | 4.5 GB/s -Copy device to host: | 0.0061757 | 1.1 | 98 | 10.0 GB/s -Alloc/Free on device: | 9.24147e-05 | 0.0 | 4 | -GPU: 92% Copy H<->D: 1.3% Alloc/free: 0.017% Comm: 0% CPU & I/O: 6.3% +Libraries: | 0.0476381 | 10.5 | 3 | +Kernels: | 0.366554 | 80.9 | 1733 | +Copy host to device: | 0.000658838 | 0.1 | 43 | 4.4 GB/s +Copy device to host: | 0.00639428 | 1.4 | 98 | 9.7 GB/s +Alloc/Free on device: | 0.000101789 | 0.0 | 4 | +GPU: 91% Copy H<->D: 1.6% Alloc/free: 0.022% Comm: 0% CPU & I/O: 7% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 0.432622 +Time of the post-resolution: 0.427963 -Total time for the whole computation 35.7981 +Total time for the whole computation 34.4652 diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is159479_cc120 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is159479_cc120 new file mode 100644 index 0000000000..018703a961 --- /dev/null +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is159479_cc120 @@ -0,0 +1,77 @@ + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. 
+More detailed statistics can be found in the thermohydraulique_VEF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 24-04-2026 -- 14:39:17 +OS: is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb 5 00:00:11 UTC 2026 (f +CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores +Total number of threads:48 +GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition +CUDA runtime version: 13.0 +CUDA drivers version: 13.0 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1638400 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 23.2994 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 0.611811 +Average number of iteration of the linear solver per call: 0 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 1.61824 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.179805 +Standard deviation between time steps: 0.0577892 +Time 
elapsed in the skipped time steps: 0.513971 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0210945 | 11.7 | 3 +Convection operator | 0.03025967 | 16.8 | 6 +Diffusion operator | 0.01563452 | 8.7 | 24 +Gradient operator | 0.006605338 | 3.7 | 6 +Divergence operator | 0.001920241 | 1.1 | 4 +Source terms | 0.02015723 | 11.2 | 6 +Update ::mettre_a_jour | 0.008509181 | 4.7 | 1 +Solver for implicit diffusion | 0.009471892 | 5.3 | 6 +Computation of the time step dt | 0.00156277 | 0.9 | 10 +Post-treatment operations | 0.05698094 | 31.7 | 1 +Other operations | 0.007608653 | 4.2 | + +Average number of iteration of the linear solver per call: 8 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.020927 | 11.6 | 3 | +Kernels: | 0.132036 | 73.4 | 1804 | +Copy host to device: | 0.00046937 | 0.3 | 43 | 6.2 GB/s +Copy device to host: | 0.00734308 | 4.1 | 98 | 8.4 GB/s +Alloc/Free on device: | 6.39967e-05 | 0.0 | 4 | +GPU: 85% Copy H<->D: 4.3% Alloc/free: 0.036% Comm: 0% CPU & I/O: 11% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.372868 + +Total time for the whole computation 25.8045 + diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is247793_gfx1100 
b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is247793_gfx1100 index d91331f1d9..db1ff81fe9 100644 --- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is247793_gfx1100 +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is247793_gfx1100 @@ -1,52 +1,77 @@ -Statistiques d'initialisation du calcul - -Temps total 54.7659 - -Statistiques de resolution du probleme - -Temps total 21.8014 - - -Timesteps 10 -Secondes / pas de temps 2.18013 -Dont solveurs Ax=B 0.047244 2% (3 appels/pas de temps) -Dont solveur diffusion_implicite 0.044039 2% (6 appels/pas de temps) -Dont mettre_a_jour 0.156435 7% (1 appel/pas de temps) -Dont operateurs convection 0.169475 7% (6 appels/pas de temps) -Dont operateurs diffusion 0.109363 5% (24 appels/pas de temps) -Dont operateurs gradient 0.041460 1% (6 appels/pas de temps) -Dont operateurs divergence 0.011335 0% (4 appels/pas de temps) -Dont operateurs source 0.278923 12% (6 appels/pas de temps) -Dont operations postraitement 1.232232 56% (1 appel/pas de temps) -Dont calcul dt 0.005455 0% (10 appels/pas de temps) -Dont calcul divers 0.084165 3% (0 appels/pas de temps) -Nb solveur / pas de temps 3 -Secondes / solveur 0.0157481 -Iterations / solveur 5.13333 -GPU statistics per time step (experimental): -Libraries : 0.046677 s 2.1% 3.0 calls -Kernels : 0.585664 s 26.9% 8259112.2 calls -Copy H2D : 0.059068 s 2.7% 123.4 calls 16.4 GB/s -Copy D2H : 0.120714 s 5.5% 174.3 calls 24.5 GB/s -Alloc/Free: 0.005446 s 0.2% 42.4 calls -GPU: 29% Copy H<->D: 8.2% Alloc/Free: 0.2% Comm: 0% CPU & Others: 62.4% -I/O: - -Timesteps = number of time steps -Nb solveur = number of linear system resolutions -Nb assemblage implicite = number of matrix assemblies for the implicit scheme -Iterations = average number of iterations of the solver -Communications = fraction of the time spent - in communications between processors (excluding io files) -Network latency = time of one mpsum measured by an 
internal bench over 0.1s -Network bandwidth = maximum on all processors - of the average bandwidth of send_recv operations -Waiting time = estimation of the waiting time of the different processors - -Max_waiting_time big => probably due to a bad partitioning -Communications > 30% => too many processors or network too slow - -Statistiques de post resolution - -Temps total 1.97435 + # Global performance file # + +This is the global file for tracking performance in TRUST. It stores aggregated quantities. +More detailed statistics can be found in the thermohydraulique_VEF_DNS_BENCH_csv.TU file +For time loop, only standard counters of level 1 are printed alongside your custom counters +Time is given in seconds + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Context of the computation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Date: 22-05-2026 -- 17:43:39 +OS: is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC +CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores +Total number of threads:48 +GPU model: AMD Radeon PRO W7900 +HIP runtime version: 7.53 +HIP drivers version: 7.53 +Nb procs used for the computation: 1 +TRUST version: 1.9.8_beta +Total number of elements used for the calculation: 1638400 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Computation start-up statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Total time of the start-up: 26.899 + +Number of calls to the linear solver per time step: 2 +Average time of the resolution of the linear problem per call: 1.26318 +Average number of iteration of the linear solver per call: 0 + 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Time loop statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The first time step is not accounted for the computation of the time loop statistics +Total time of the time loop: 5.13364 +Number of time steps: 9 +Skipped time steps: 1 +Average time per time step: 0.570404 +Standard deviation between time steps: 0.176546 +Time elapsed in the skipped time steps: 1.05177 + + +Standard counter description | Time/step | % loop time | Call(s)/step +------------------------------------------------------------------------------------------ +Linear solver resolutions Ax=B | 0.0608076 | 10.7 | 3 +Convection operator | 0.09816364 | 17.2 | 6 +Diffusion operator | 0.05411814 | 9.5 | 24 +Gradient operator | 0.02642929 | 4.6 | 6 +Divergence operator | 0.007925149 | 1.4 | 4 +Source terms | 0.08792285 | 15.4 | 6 +Update ::mettre_a_jour | 0.02459289 | 4.3 | 1 +Solver for implicit diffusion | 0.03142004 | 5.5 | 6 +Computation of the time step dt | 0.004884953 | 0.9 | 10 +Post-treatment operations | 0.149868 | 26.3 | 1 +Other operations | 0.02427166 | 4.3 | + +Average number of iteration of the linear solver per call: 7 + + +----------------------------------------------------------------------------------------------------------- + GPU statistics +----------------------------------------------------------------------------------------------------------- +Counter description | Time per step | % loop time | Call(s)/step | Bandwidth +----------------------------------------------------------------------------------------------------------- +Libraries: | 0.0588719 | 10.3 | 3 | +Kernels: | 0.470853 | 82.5 | 1632 | +Copy host to device: | 0.00133171 | 0.2 | 51 | 2.2 GB/s +Copy device to host: | 0.00484713 | 0.8 | 40 | 12.8 GB/s +Alloc/Free on device: | 0.000724319 | 0.1 | 23 | +GPU: 93% Copy 
H<->D: 1.1% Alloc/free: 0.13% Comm: 0% CPU & I/O: 5.9% +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Post-resolution statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Time of the post-resolution: 0.663809 + +Total time for the whole computation 33.7482 diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.lumi_gfx90a b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.lumi_gfx90a index 40ba048fd3..ce8211b76c 100644 --- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.lumi_gfx90a +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.lumi_gfx90a @@ -8,13 +8,13 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 13:11:49 -OS: nid005020__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) +Date: 15-05-2026 -- 21:17:21 +OS: nid007956__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98) CPU model : AMD EPYC 7A53 64-Core Processor Total number of threads:128 GPU model: AMD Instinct MI250X -HIP runtime version: 6.42 -HIP drivers version: 6.42 +HIP runtime version: 6.43 +HIP drivers version: 6.43 Nb procs used for the computation: 1 TRUST version: 1.9.8_beta Total number of elements used for the calculation: 1638400 @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 41.2855 +Total time of the start-up: 76.7767 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.36088 +Average time of the resolution of the linear problem per call: 2.71566 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 8.43206 +Total time of the time loop: 6.97185 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.936896 -Standard deviation between time steps: 0.1774 -Time elapsed in the skipped time steps: 1.36652 +Average time per time step: 0.77465 +Standard deviation between time steps: 0.278049 +Time elapsed in the skipped time steps: 1.17363 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0714963 | 6.6 | 3 -Convection operator | 0.2741781 | 25.2 | 6 -Diffusion operator | 0.09669706 | 8.9 | 24 -Gradient operator | 0.08605537 | 7.9 | 6 -Divergence operator | 0.0199693 | 1.8 | 4 -Source terms | 0.1487263 | 13.7 | 6 -Update ::mettre_a_jour | 0.02601599 | 2.4 | 1 -Solver for implicit diffusion | 0.02026094 | 1.9 | 6 -Computation of the time step dt | 0.003127665 | 0.3 | 10 -Post-treatment operations | 0.1756955 | 16.1 | 1 -Other operations | 0.01467348 | 1.3 | +Linear solver resolutions Ax=B | 0.028532 | 3.7 | 3 +Convection operator | 0.2375367 | 30.7 | 6 +Diffusion operator | 0.08060514 | 10.4 | 
24 +Gradient operator | 0.04153123 | 5.4 | 6 +Divergence operator | 0.01144389 | 1.5 | 4 +Source terms | 0.1300165 | 16.8 | 6 +Update ::mettre_a_jour | 0.01896836 | 2.4 | 1 +Solver for implicit diffusion | 0.01883069 | 2.4 | 6 +Computation of the time step dt | 0.00290556 | 0.4 | 10 +Post-treatment operations | 0.1897926 | 24.5 | 1 +Other operations | 0.01448759 | 1.9 | Average number of iteration of the linear solver per call: 8 @@ -62,17 +62,17 @@ Average number of iteration of the linear solver per call: 8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth ----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0687561 | 7.3 | 3 | -Kernels: | 0.805209 | 85.9 | 1807 | -Copy host to device: | 0.00104428 | 0.1 | 43 | 2.8 GB/s -Copy device to host: | 0.00508409 | 0.5 | 98 | 12.2 GB/s -Alloc/Free on device: | 0.000108709 | 0.0 | 4 | -GPU: 93% Copy H<->D: 0.65% Alloc/free: 0.012% Comm: 0% CPU & I/O: 6.1% +Libraries: | 0.0279793 | 3.6 | 3 | +Kernels: | 0.680184 | 87.8 | 1632 | +Copy host to device: | 0.0011041 | 0.1 | 51 | 2.6 GB/s +Copy device to host: | 0.00390742 | 0.5 | 40 | 15.9 GB/s +Alloc/Free on device: | 0.000569724 | 0.1 | 23 | +GPU: 91% Copy H<->D: 0.65% Alloc/free: 0.074% Comm: 0% CPU & I/O: 7.9% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 1.51324 +Time of the post-resolution: 1.70629 -Total time for the whole computation 52.5973 +Total time for the whole computation 86.6285 -Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh) +[Slurm] Power consumption (107 s): 0.497 kW 0.015 kWh 0.001 € (0.10€/kWh) diff --git 
a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.topaze_cc80 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.topaze_cc80 index c45966936b..1c6a1ecb9a 100644 --- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.topaze_cc80 +++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.topaze_cc80 @@ -8,8 +8,8 @@ Time is given in seconds ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Context of the computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Date: 20-02-2026 -- 18:14:24 -OS: topaze7059__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 +Date: 15-05-2026 -- 14:02:44 +OS: topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025 CPU model : AMD EPYC 7763 64-Core Processor Total number of threads:256 GPU model: NVIDIA A100-SXM4-80GB @@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Computation start-up statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Total time of the start-up: 39.1244 +Total time of the start-up: 39.6475 Number of calls to the linear solver per time step: 2 -Average time of the resolution of the linear problem per call: 1.31573 +Average time of the resolution of the linear problem per call: 1.26543 Average number of iteration of the linear solver per call: 0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Time loop statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The 
first time step is not accounted for the computation of the time loop statistics -Total time of the time loop: 3.01555 +Total time of the time loop: 2.7949 Number of time steps: 9 Skipped time steps: 1 -Average time per time step: 0.335062 -Standard deviation between time steps: 0.151722 -Time elapsed in the skipped time steps: 0.911653 +Average time per time step: 0.310545 +Standard deviation between time steps: 0.237972 +Time elapsed in the skipped time steps: 0.784626 Standard counter description | Time/step | % loop time | Call(s)/step ------------------------------------------------------------------------------------------ -Linear solver resolutions Ax=B | 0.0285602 | 6.5 | 3 -Convection operator | 0.0496885 | 11.4 | 6 -Diffusion operator | 0.03493974 | 8.0 | 24 -Gradient operator | 0.01687099 | 3.9 | 6 -Divergence operator | 0.005575172 | 1.3 | 4 -Source terms | 0.03593161 | 8.2 | 6 -Update ::mettre_a_jour | 0.01548032 | 3.5 | 1 -Solver for implicit diffusion | 0.01588839 | 3.6 | 6 -Computation of the time step dt | 0.002096975 | 0.5 | 10 -Post-treatment operations | 0.1171378 | 26.8 | 1 -Other operations | 0.01289186 | 3.0 | +Linear solver resolutions Ax=B | 0.0277935 | 8.9 | 3 +Convection operator | 0.0337385 | 10.9 | 6 +Diffusion operator | 0.02719853 | 8.8 | 24 +Gradient operator | 0.01307838 | 4.2 | 6 +Divergence operator | 0.002931471 | 0.9 | 4 +Source terms | 0.03489879 | 11.2 | 6 +Update ::mettre_a_jour | 0.01111204 | 3.6 | 1 +Solver for implicit diffusion | 0.01428297 | 4.6 | 6 +Computation of the time step dt | 0.001949491 | 0.6 | 10 +Post-treatment operations | 0.1323786 | 42.6 | 1 +Other operations | 0.01118269 | 3.6 | Average number of iteration of the linear solver per call: 8 @@ -62,16 +62,17 @@ Average number of iteration of the linear solver per call: 8 ----------------------------------------------------------------------------------------------------------- Counter description | Time per step | % loop time | Call(s)/step | Bandwidth 
----------------------------------------------------------------------------------------------------------- -Libraries: | 0.0281164 | 8.4 | 3 | -Kernels: | 0.237447 | 70.9 | 1807 | -Copy host to device: | 0.000842697 | 0.3 | 43 | 3.5 GB/s -Copy device to host: | 0.00602455 | 1.8 | 98 | 10.3 GB/s -Alloc/Free on device: | 0.000168023 | 0.1 | 4 | -GPU: 79% Copy H<->D: 2% Alloc/free: 0.05% Comm: 0% CPU & I/O: 19% +Libraries: | 0.027341 | 8.8 | 3 | +Kernels: | 0.212071 | 68.3 | 1632 | +Copy host to device: | 0.00092218 | 0.3 | 51 | 3.2 GB/s +Copy device to host: | 0.00497458 | 1.6 | 40 | 12.5 GB/s +Alloc/Free on device: | 0.000821941 | 0.3 | 23 | +GPU: 77% Copy H<->D: 1.9% Alloc/free: 0.26% Comm: 0% CPU & I/O: 21% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-resolution statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Time of the post-resolution: 1.31988 +Time of the post-resolution: 1.08213 -Total time for the whole computation 44.3715 +Total time for the whole computation 44.3091 +[Slurm] Power consumption (79 s): 0.511 kW 0.011 kWh 0.001 € (0.10€/kWh) diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.data b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.data new file mode 100755 index 0000000000..02951dea58 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.data @@ -0,0 +1,126 @@ +Dimension 3 + +Pb_hydraulique pb + +Domaine dom +Mailler dom +{ + pave bloc + { + origine 0 0 0 + longueurs 1 1 1 + nombre_de_noeuds 6 6 6 + } + { + bord frontiere X = 0 0 <= Y <= 1 0 <= Z <= 1 + bord frontiere X = 1 0 <= Y <= 1 0 <= Z <= 1 + bord frontiere Y = 0 0 <= X <= 1 0 <= Z <= 1 + bord frontiere Y = 1 0 <= X <= 1 0 <= Z <= 1 + bord frontiere Z = 0 0 <= X <= 1 0 <= Y <= 1 + bord frontiere Z = 1 0 <= X <= 1 0 <= Y <= 1 + } +} + +VDF dis + 
+Schema_Euler_explicite sch +Lire sch +{ + nb_pas_dt_max 10 + tinit 0 + tmax 1 + dt_sauv -1 +} + +Associer pb dom +Associer pb sch + +Discretiser pb dis + +Lire pb +{ + Fluide_incompressible + { + mu champ_uniforme 1 1 + rho champ_uniforme 1 1 + } + Navier_Stokes_standard + { + solveur_pression petsc Cholesky { } + conditions_initiales { vitesse champ_uniforme 3 1 0 0 } + conditions_limites { + frontiere paroi_fixe + } + convection { centre } + diffusion { } + sources { source_qdm champ_fonc_xyz dom 3 cos(2*pi*x)*cos(2*pi*y)*cos(2*pi*z) cos(2*pi*x)*cos(2*pi*y)*sin(2*pi*z) cos(2*pi*x)*sin(2*pi*y)*sin(2*pi*z) } + } + Postraitement + { + definition_champs { + ui refChamp { pb_champ pb vitesse } + u1 transformation { methode composante numero 0 localisation elem sources_reference { ui } } + u2 transformation { methode composante numero 1 localisation elem sources_reference { ui } } + u3 transformation { methode composante numero 2 localisation elem sources_reference { ui } } + moy_u1 moyenne { t_deb 0 t_fin 1e+6 sources_reference { u1 } } + moy_u2 moyenne { t_deb 0 t_fin 1e+6 sources_reference { u2 } } + moy_u3 moyenne { t_deb 0 t_fin 1e+6 sources_reference { u3 } } + u1prime transformation { methode formule expression 1 u1-moy_u1 localisation elem sources_reference { u1 , moy_u1 } } + u2prime transformation { methode formule expression 1 u2-moy_u2 localisation elem sources_reference { u2 , moy_u2 } } + u3prime transformation { methode formule expression 1 u3-moy_u3 localisation elem sources_reference { u3 , moy_u3 } } + + u1u1u1_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u1prime localisation elem sources_reference { u1prime } } } } + u1u1u2_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u2prime localisation elem sources_reference { u1prime , u2prime } } } } + u1u1u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode 
formule expression 1 u1prime*u1prime*u3prime localisation elem sources_reference { u1prime , u3prime } } } } + u1u2u2_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u2prime*u2prime localisation elem sources_reference { u1prime , u2prime } } } } + u1u2u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u2prime*u3prime localisation elem sources_reference { u1prime , u2prime , u3prime } } } } + u1u3u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u3prime*u3prime localisation elem sources_reference { u1prime , u3prime } } } } + u2u2u2_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u2prime*u2prime localisation elem sources_reference { u2prime } } } } + u2u2u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u2prime*u3prime localisation elem sources_reference { u2prime , u3prime } } } } + u2u3u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u3prime*u3prime localisation elem sources_reference { u2prime , u3prime } } } } + u3u3u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u3prime*u3prime*u3prime localisation elem sources_reference { u3prime } } } } + + # uiujuk = vecteur 27 composantes : composante (i,j,k) -> colonne 9*i+3*j+k-13 # + uiujuk correlation_triple { t_deb 0 t_fin 1e+6 sources_reference { ui , ui , ui } } + u1u1u1_methode2 transformation { methode composante numero 0 localisation elem sources_reference { uiujuk } } + u1u1u2_methode2 transformation { methode composante numero 1 localisation elem sources_reference { uiujuk } } + u1u1u3_methode2 transformation { methode composante numero 2 localisation elem sources_reference { uiujuk } } + u1u2u2_methode2 transformation { methode composante 
numero 4 localisation elem sources_reference { uiujuk } } + u1u2u3_methode2 transformation { methode composante numero 5 localisation elem sources_reference { uiujuk } } + u1u3u3_methode2 transformation { methode composante numero 8 localisation elem sources_reference { uiujuk } } + u2u2u2_methode2 transformation { methode composante numero 13 localisation elem sources_reference { uiujuk } } + u2u2u3_methode2 transformation { methode composante numero 14 localisation elem sources_reference { uiujuk } } + u2u3u3_methode2 transformation { methode composante numero 17 localisation elem sources_reference { uiujuk } } + u3u3u3_methode2 transformation { methode composante numero 26 localisation elem sources_reference { uiujuk } } + } + sondes { + u1u1u1_methode1 u1u1u1_methode1 periode 1e-6 point 1 0.5 0.5 0.5 + u1u1u2_methode1 u1u1u2_methode1 periode 1e-6 position_like u1u1u1_methode1 + u1u1u3_methode1 u1u1u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + u1u2u2_methode1 u1u2u2_methode1 periode 1e-6 position_like u1u1u1_methode1 + u1u2u3_methode1 u1u2u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + u1u3u3_methode1 u1u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + u2u2u2_methode1 u2u2u2_methode1 periode 1e-6 position_like u1u1u1_methode1 + u2u2u3_methode1 u2u2u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + u2u3u3_methode1 u2u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + u3u3u3_methode1 u3u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + + u1u1u1_methode2 u1u1u1_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u1u2_methode2 u1u1u2_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u1u3_methode2 u1u1u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u2u2_methode2 u1u2u2_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u2u3_methode2 u1u2u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u3u3_methode2 u1u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u2u2u2_methode2 
u2u2u2_methode2 periode 1e-6 position_like u1u1u1_methode1 + u2u2u3_methode2 u2u2u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u2u3u3_methode2 u2u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u3u3u3_methode2 u3u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + } + } +} + +EcritureLectureSpecial 0 + +Resoudre pb + +Fin diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.lml.gz b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.lml.gz new file mode 100644 index 0000000000..b73bcf440a Binary files /dev/null and b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.lml.gz differ diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE1.son.ref new file mode 100644 index 0000000000..5b9124bf11 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U1U1_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U1_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -8.18407828e-11 +1.99971806e-02 -1.46044746e-10 +2.66621435e-02 -1.82050285e-10 +3.33270417e-02 -1.92896806e-10 +3.99919067e-02 -1.90502981e-10 +4.66567528e-02 -1.82207513e-10 +5.33215866e-02 -1.71361748e-10 +5.99864138e-02 -1.60277265e-10 +6.66512360e-02 -1.49535492e-10 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE2.son.ref new file mode 100644 index 0000000000..06ba690bb8 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U1U1_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U1_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -8.18407828e-11 +1.99971806e-02 -1.46044746e-10 +2.66621435e-02 -1.82050285e-10 +3.33270417e-02 -1.92896806e-10 +3.99919067e-02 -1.90502981e-10 +4.66567528e-02 -1.82207513e-10 +5.33215866e-02 -1.71361748e-10 +5.99864138e-02 -1.60277265e-10 +6.66512360e-02 -1.49535492e-10 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE1.son.ref new file mode 100644 index 0000000000..a1d24db973 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U1U2_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U2_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -4.09203914e-11 +1.99971806e-02 -7.30223733e-11 +2.66621435e-02 -9.10251430e-11 +3.33270417e-02 -9.64484038e-11 +3.99919067e-02 -9.52514912e-11 +4.66567528e-02 -9.11037572e-11 +5.33215866e-02 -8.56808748e-11 +5.99864138e-02 -8.01386335e-11 +6.66512360e-02 -7.47677468e-11 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE2.son.ref new file mode 100644 index 0000000000..622fe5415c --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U1U2_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U2_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -4.09203914e-11 +1.99971806e-02 -7.30223733e-11 +2.66621435e-02 -9.10251430e-11 +3.33270417e-02 -9.64484038e-11 +3.99919067e-02 -9.52514912e-11 +4.66567528e-02 -9.11037572e-11 +5.33215866e-02 -8.56808748e-11 +5.99864138e-02 -8.01386335e-11 +6.66512360e-02 -7.47677468e-11 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE1.son.ref new file mode 100644 index 0000000000..f36fa51c38 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U1U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -6.51020414e-15 +1.99971806e-02 -1.79543497e-14 +2.66621435e-02 -2.58186455e-14 +3.33270417e-02 -2.93005285e-14 +3.99919067e-02 -3.01466580e-14 +4.66567528e-02 -2.95220709e-14 +5.33215866e-02 -2.82337440e-14 +5.99864138e-02 -2.66926033e-14 +6.66512360e-02 -2.51088448e-14 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE2.son.ref new file mode 100644 index 0000000000..2f77bcd18e --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U1U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -6.51020414e-15 +1.99971806e-02 -1.79543497e-14 +2.66621435e-02 -2.58186455e-14 +3.33270417e-02 -2.93005285e-14 +3.99919067e-02 -3.01466580e-14 +4.66567528e-02 -2.95220709e-14 +5.33215866e-02 -2.82337440e-14 +5.99864138e-02 -2.66926033e-14 +6.66512360e-02 -2.51088448e-14 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE1.son.ref new file mode 100644 index 0000000000..e42032e0fb --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U2U2_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U2U2_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -2.04601957e-11 +1.99971806e-02 -3.65111867e-11 +2.66621435e-02 -4.55125718e-11 +3.33270417e-02 -4.82242023e-11 +3.99919067e-02 -4.76257460e-11 +4.66567528e-02 -4.55518790e-11 +5.33215866e-02 -4.28404378e-11 +5.99864138e-02 -4.00693172e-11 +6.66512360e-02 -3.73838738e-11 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE2.son.ref new file mode 100644 index 0000000000..672aee1070 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U2U2_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U2U2_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -2.04601957e-11 +1.99971806e-02 -3.65111867e-11 +2.66621435e-02 -4.55125718e-11 +3.33270417e-02 -4.82242023e-11 +3.99919067e-02 -4.76257460e-11 +4.66567528e-02 -4.55518790e-11 +5.33215866e-02 -4.28404378e-11 +5.99864138e-02 -4.00693172e-11 +6.66512360e-02 -3.73838738e-11 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE1.son.ref new file mode 100644 index 0000000000..da3a8b7bb8 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U2U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U2U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -3.25510207e-15 +1.99971806e-02 -8.97717487e-15 +2.66621435e-02 -1.29093228e-14 +3.33270417e-02 -1.46502644e-14 +3.99919067e-02 -1.50733291e-14 +4.66567528e-02 -1.47610356e-14 +5.33215866e-02 -1.41168722e-14 +5.99864138e-02 -1.33463018e-14 +6.66512360e-02 -1.25544225e-14 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE2.son.ref new file mode 100644 index 0000000000..0b120ccc4b --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U2U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U2U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -3.25510207e-15 +1.99971806e-02 -8.97717487e-15 +2.66621435e-02 -1.29093228e-14 +3.33270417e-02 -1.46502644e-14 +3.99919067e-02 -1.50733291e-14 +4.66567528e-02 -1.47610356e-14 +5.33215866e-02 -1.41168722e-14 +5.99864138e-02 -1.33463018e-14 +6.66512360e-02 -1.25544225e-14 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE1.son.ref new file mode 100644 index 0000000000..9134f77668 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U3U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U3U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -5.17868432e-19 +1.99971806e-02 -2.37125564e-18 +2.66621435e-02 -3.88270790e-18 +3.33270417e-02 -4.68789302e-18 +3.99919067e-02 -5.00982847e-18 +4.66567528e-02 -5.01075198e-18 +5.33215866e-02 -4.86799847e-18 +5.99864138e-02 -4.64726336e-18 +6.66512360e-02 -4.40545326e-18 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE2.son.ref new file mode 100644 index 0000000000..7582db2bbe --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U1U3U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U3U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -5.17868432e-19 +1.99971806e-02 -2.37125564e-18 +2.66621435e-02 -3.88270790e-18 +3.33270417e-02 -4.68789302e-18 +3.99919067e-02 -5.00982847e-18 +4.66567528e-02 -5.01075198e-18 +5.33215866e-02 -4.86799847e-18 +5.99864138e-02 -4.64726336e-18 +6.66512360e-02 -4.40545326e-18 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE1.son.ref new file mode 100644 index 0000000000..f1f5819c22 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U2U2U2_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U2U2_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -1.02300979e-11 +1.99971806e-02 -1.82555934e-11 +2.66621435e-02 -2.27562860e-11 +3.33270417e-02 -2.41121013e-11 +3.99919067e-02 -2.38128732e-11 +4.66567528e-02 -2.27759397e-11 +5.33215866e-02 -2.14202191e-11 +5.99864138e-02 -2.00346588e-11 +6.66512360e-02 -1.86919371e-11 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE2.son.ref new file mode 100644 index 0000000000..425c540bcc --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U2U2U2_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U2U2_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -1.02300979e-11 +1.99971806e-02 -1.82555934e-11 +2.66621435e-02 -2.27562860e-11 +3.33270417e-02 -2.41121013e-11 +3.99919067e-02 -2.38128732e-11 +4.66567528e-02 -2.27759397e-11 +5.33215866e-02 -2.14202191e-11 +5.99864138e-02 -2.00346588e-11 +6.66512360e-02 -1.86919371e-11 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE1.son.ref new file mode 100644 index 0000000000..b726e58409 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U2U2U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U2U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -1.62755103e-15 +1.99971806e-02 -4.48858744e-15 +2.66621435e-02 -6.45466146e-15 +3.33270417e-02 -7.32513225e-15 +3.99919067e-02 -7.53666465e-15 +4.66567528e-02 -7.38051788e-15 +5.33215866e-02 -7.05843616e-15 +5.99864138e-02 -6.67315098e-15 +6.66512360e-02 -6.27721135e-15 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE2.son.ref new file mode 100644 index 0000000000..475fca6de8 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U2U2U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U2U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -1.62755103e-15 +1.99971806e-02 -4.48858744e-15 +2.66621435e-02 -6.45466146e-15 +3.33270417e-02 -7.32513225e-15 +3.99919067e-02 -7.53666465e-15 +4.66567528e-02 -7.38051788e-15 +5.33215866e-02 -7.05843616e-15 +5.99864138e-02 -6.67315098e-15 +6.66512360e-02 -6.27721135e-15 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE1.son.ref new file mode 100644 index 0000000000..bb0117ccfa --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U2U3U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U3U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -2.58934216e-19 +1.99971806e-02 -1.18562782e-18 +2.66621435e-02 -1.94135396e-18 +3.33270417e-02 -2.34394653e-18 +3.99919067e-02 -2.50491426e-18 +4.66567528e-02 -2.50537602e-18 +5.33215866e-02 -2.43399927e-18 +5.99864138e-02 -2.32363171e-18 +6.66512360e-02 -2.20272666e-18 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE2.son.ref new file mode 100644 index 0000000000..b83645a934 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U2U3U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U3U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 0.00000000e+00 +1.33320901e-02 -2.58934216e-19 +1.99971806e-02 -1.18562782e-18 +2.66621435e-02 -1.94135396e-18 +3.33270417e-02 -2.34394653e-18 +3.99919067e-02 -2.50491426e-18 +4.66567528e-02 -2.50537602e-18 +5.33215866e-02 -2.43399927e-18 +5.99864138e-02 -2.32363171e-18 +6.66512360e-02 -2.20272666e-18 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE1.son.ref new file mode 100644 index 0000000000..29887e50fa --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U3U3U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U3U3U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 1.49813643e-95 +1.33320901e-02 -4.11949775e-23 +1.99971806e-02 -3.28966603e-22 +2.66621435e-02 -6.05179509e-22 +3.33270417e-02 -7.73523441e-22 +3.99919067e-02 -8.57039449e-22 +4.66567528e-02 -8.74074347e-22 +5.33215866e-02 -8.62178607e-22 +5.99864138e-02 -8.30605582e-22 +6.66512360e-02 -7.93322822e-22 diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE2.son.ref new file mode 100644 index 0000000000..e9f9c3f4a9 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VDF_U3U3U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U3U3U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +6.66666667e-03 1.49813643e-95 +1.33320901e-02 -4.11949775e-23 +1.99971806e-02 -3.28966603e-22 +2.66621435e-02 -6.05179509e-22 +3.33270417e-02 -7.73523441e-22 +3.99919067e-02 -8.57039449e-22 +4.66567528e-02 -8.74074347e-22 +5.33215866e-02 -8.62178607e-22 +5.99864138e-02 -8.30605582e-22 +6.66512360e-02 -7.93322822e-22 diff --git a/tests/Reference/Correlation_triple_VDF/verifie b/tests/Reference/Correlation_triple_VDF/verifie new file mode 100755 index 0000000000..f989b60130 --- /dev/null +++ b/tests/Reference/Correlation_triple_VDF/verifie @@ -0,0 +1,17 @@ +message() +{ + [ $1 != $2 ] && echo $ECHO_OPTS "Error ($1!=$2) when checking:\n $msg" && err=1 + #echo $msg +} + +##################################### +# Comparaison non regression des .son (reduction) +##################################### +err=0 +for file in `ls *.son.ref 2>/dev/null` +do + msg="compare_sonde $file ${file%.ref}" + eval $msg 1>verifie.log 2>&1 + message $? 
0 +done +exit $err diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.data b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.data new file mode 100755 index 0000000000..eeaf320247 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.data @@ -0,0 +1,127 @@ +Dimension 3 + +Pb_hydraulique pb + +Domaine dom +Mailler dom +{ + pave bloc + { + origine 0 0 0 + longueurs 1 1 1 + nombre_de_noeuds 3 3 3 + } + { + bord frontiere X = 0 0 <= Y <= 1 0 <= Z <= 1 + bord frontiere X = 1 0 <= Y <= 1 0 <= Z <= 1 + bord frontiere Y = 0 0 <= X <= 1 0 <= Z <= 1 + bord frontiere Y = 1 0 <= X <= 1 0 <= Z <= 1 + bord frontiere Z = 0 0 <= X <= 1 0 <= Y <= 1 + bord frontiere Z = 1 0 <= X <= 1 0 <= Y <= 1 + } +} +Tetraedriser_homogene_fin dom + +VEFPreP1b dis + +Schema_Euler_explicite sch +Lire sch +{ + nb_pas_dt_max 10 + tinit 0 + tmax 1 + dt_sauv -1 +} + +Associer pb dom +Associer pb sch + +Discretiser pb dis + +Lire pb +{ + Fluide_incompressible + { + mu champ_uniforme 1 1 + rho champ_uniforme 1 1 + } + Navier_Stokes_standard + { + solveur_pression petsc Cholesky { } + conditions_initiales { vitesse champ_uniforme 3 1 0 0 } + conditions_limites { + frontiere paroi_fixe + } + convection { centre } + diffusion { } + sources { source_qdm champ_fonc_xyz dom 3 cos(2*pi*x)*cos(2*pi*y)*cos(2*pi*z) cos(2*pi*x)*cos(2*pi*y)*sin(2*pi*z) cos(2*pi*x)*sin(2*pi*y)*sin(2*pi*z) } + } + Postraitement + { + definition_champs { + ui refChamp { pb_champ pb vitesse } + u1 transformation { methode composante numero 0 localisation elem sources_reference { ui } } + u2 transformation { methode composante numero 1 localisation elem sources_reference { ui } } + u3 transformation { methode composante numero 2 localisation elem sources_reference { ui } } + moy_u1 moyenne { t_deb 0 t_fin 1e+6 sources_reference { u1 } } + moy_u2 moyenne { t_deb 0 t_fin 1e+6 sources_reference { u2 } } + moy_u3 moyenne { t_deb 0 t_fin 1e+6 sources_reference { u3 } } + 
u1prime transformation { methode formule expression 1 u1-moy_u1 localisation elem sources_reference { u1 , moy_u1 } } + u2prime transformation { methode formule expression 1 u2-moy_u2 localisation elem sources_reference { u2 , moy_u2 } } + u3prime transformation { methode formule expression 1 u3-moy_u3 localisation elem sources_reference { u3 , moy_u3 } } + + u1u1u1_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u1prime localisation elem sources_reference { u1prime } } } } + u1u1u2_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u2prime localisation elem sources_reference { u1prime , u2prime } } } } + u1u1u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u3prime localisation elem sources_reference { u1prime , u3prime } } } } + u1u2u2_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u2prime*u2prime localisation elem sources_reference { u1prime , u2prime } } } } + u1u2u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u2prime*u3prime localisation elem sources_reference { u1prime , u2prime , u3prime } } } } + u1u3u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u3prime*u3prime localisation elem sources_reference { u1prime , u3prime } } } } + u2u2u2_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u2prime*u2prime localisation elem sources_reference { u2prime } } } } + u2u2u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u2prime*u3prime localisation elem sources_reference { u2prime , u3prime } } } } + u2u3u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 
u2prime*u3prime*u3prime localisation elem sources_reference { u2prime , u3prime } } } } + u3u3u3_methode1 moyenne { t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u3prime*u3prime*u3prime localisation elem sources_reference { u3prime } } } } + + # uiujuk = vecteur 27 composantes : composante (i,j,k) -> colonne 9*i+3*j+k-13 # + uiujuk correlation_triple { t_deb 0 t_fin 1e+6 sources_reference { ui , ui , ui } } + u1u1u1_methode2 transformation { methode composante numero 0 localisation elem sources_reference { uiujuk } } + u1u1u2_methode2 transformation { methode composante numero 1 localisation elem sources_reference { uiujuk } } + u1u1u3_methode2 transformation { methode composante numero 2 localisation elem sources_reference { uiujuk } } + u1u2u2_methode2 transformation { methode composante numero 4 localisation elem sources_reference { uiujuk } } + u1u2u3_methode2 transformation { methode composante numero 5 localisation elem sources_reference { uiujuk } } + u1u3u3_methode2 transformation { methode composante numero 8 localisation elem sources_reference { uiujuk } } + u2u2u2_methode2 transformation { methode composante numero 13 localisation elem sources_reference { uiujuk } } + u2u2u3_methode2 transformation { methode composante numero 14 localisation elem sources_reference { uiujuk } } + u2u3u3_methode2 transformation { methode composante numero 17 localisation elem sources_reference { uiujuk } } + u3u3u3_methode2 transformation { methode composante numero 26 localisation elem sources_reference { uiujuk } } + } + sondes { + u1u1u1_methode1 u1u1u1_methode1 periode 1e-6 point 1 0.5 0.5 0.5 + u1u1u2_methode1 u1u1u2_methode1 periode 1e-6 position_like u1u1u1_methode1 + u1u1u3_methode1 u1u1u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + u1u2u2_methode1 u1u2u2_methode1 periode 1e-6 position_like u1u1u1_methode1 + u1u2u3_methode1 u1u2u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + u1u3u3_methode1 u1u3u3_methode1 periode 
1e-6 position_like u1u1u1_methode1 + u2u2u2_methode1 u2u2u2_methode1 periode 1e-6 position_like u1u1u1_methode1 + u2u2u3_methode1 u2u2u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + u2u3u3_methode1 u2u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + u3u3u3_methode1 u3u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1 + + u1u1u1_methode2 u1u1u1_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u1u2_methode2 u1u1u2_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u1u3_methode2 u1u1u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u2u2_methode2 u1u2u2_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u2u3_methode2 u1u2u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u1u3u3_methode2 u1u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u2u2u2_methode2 u2u2u2_methode2 periode 1e-6 position_like u1u1u1_methode1 + u2u2u3_methode2 u2u2u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u2u3u3_methode2 u2u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + u3u3u3_methode2 u3u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1 + } + } +} + +EcritureLectureSpecial 0 + +Resoudre pb + +Fin diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.lml.gz b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.lml.gz new file mode 100644 index 0000000000..6dbb3f8b4c Binary files /dev/null and b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.lml.gz differ diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE1.son.ref new file mode 100644 index 0000000000..cbeded905f --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U1U1_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U1_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -2.26527027e-14 +1.73609871e-03 -1.15311174e-13 +2.31479037e-03 -2.99001596e-13 +2.89347850e-03 -5.80417457e-13 +3.47216334e-03 -9.58580792e-13 +4.05084513e-03 -1.42858148e-12 +4.62952411e-03 -1.98362480e-12 +5.20820048e-03 -2.61613726e-12 +5.78687442e-03 -3.31835602e-12 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE2.son.ref new file mode 100644 index 0000000000..5bac1202a9 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U1U1_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U1_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -2.26527027e-14 +1.73609871e-03 -1.15311174e-13 +2.31479037e-03 -2.99001596e-13 +2.89347850e-03 -5.80417457e-13 +3.47216334e-03 -9.58580792e-13 +4.05084513e-03 -1.42858148e-12 +4.62952411e-03 -1.98362480e-12 +5.20820048e-03 -2.61613726e-12 +5.78687442e-03 -3.31835602e-12 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE1.son.ref new file mode 100644 index 0000000000..46b5c3e48b --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U1U2_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U2_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -4.56931963e-14 +1.73609871e-03 -2.36989146e-13 +2.31479037e-03 -6.25205318e-13 +2.89347850e-03 -1.23230794e-12 +3.47216334e-03 -2.06213704e-12 +4.05084513e-03 -3.10754383e-12 +4.62952411e-03 -4.35491800e-12 +5.20820048e-03 -5.78705876e-12 +5.78687442e-03 -7.38499816e-12 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE2.son.ref new file mode 100644 index 0000000000..4971af5fb7 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U1U2_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U2_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -4.56931963e-14 +1.73609871e-03 -2.36989146e-13 +2.31479037e-03 -6.25205318e-13 +2.89347850e-03 -1.23230794e-12 +3.47216334e-03 -2.06213704e-12 +4.05084513e-03 -3.10754383e-12 +4.62952411e-03 -4.35491800e-12 +5.20820048e-03 -5.78705876e-12 +5.78687442e-03 -7.38499816e-12 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE1.son.ref new file mode 100644 index 0000000000..a022eb32e7 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U1U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -5.05224794e-15 +1.73609871e-03 -2.61954964e-14 +2.31479037e-03 -6.86578915e-14 +2.89347850e-03 -1.33490753e-13 +3.47216334e-03 -2.18917985e-13 +4.05084513e-03 -3.21461679e-13 +4.62952411e-03 -4.36772000e-13 +5.20820048e-03 -5.60195372e-13 +5.78687442e-03 -6.87142168e-13 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE2.son.ref new file mode 100644 index 0000000000..e2d49a4973 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U1U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U1U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -5.05224794e-15 +1.73609871e-03 -2.61954964e-14 +2.31479037e-03 -6.86578915e-14 +2.89347850e-03 -1.33490753e-13 +3.47216334e-03 -2.18917985e-13 +4.05084513e-03 -3.21461679e-13 +4.62952411e-03 -4.36772000e-13 +5.20820048e-03 -5.60195372e-13 +5.78687442e-03 -6.87142168e-13 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE1.son.ref new file mode 100644 index 0000000000..6cbb349dff --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U2U2_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U2U2_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -9.21686130e-14 +1.73609871e-03 -4.87088644e-13 +2.31479037e-03 -1.30746418e-12 +2.89347850e-03 -2.61692373e-12 +3.47216334e-03 -4.43738903e-12 +4.05084513e-03 -6.76190998e-12 +4.62952411e-03 -9.56421672e-12 +5.20820048e-03 -1.28057460e-11 +5.78687442e-03 -1.64407399e-11 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE2.son.ref new file mode 100644 index 0000000000..f2510e0645 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U2U2_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U2U2_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -9.21686130e-14 +1.73609871e-03 -4.87088644e-13 +2.31479037e-03 -1.30746418e-12 +2.89347850e-03 -2.61692373e-12 +3.47216334e-03 -4.43738903e-12 +4.05084513e-03 -6.76190998e-12 +4.62952411e-03 -9.56421672e-12 +5.20820048e-03 -1.28057460e-11 +5.78687442e-03 -1.64407399e-11 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE1.son.ref new file mode 100644 index 0000000000..341b3f96e0 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U2U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U2U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -1.01909852e-14 +1.73609871e-03 -5.38400943e-14 +2.31479037e-03 -1.43574764e-13 +2.89347850e-03 -2.83434637e-13 +3.47216334e-03 -4.70913274e-13 +4.05084513e-03 -6.99081300e-13 +4.62952411e-03 -9.58414535e-13 +5.20820048e-03 -1.23820699e-12 +5.78687442e-03 -1.52758340e-12 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE2.son.ref new file mode 100644 index 0000000000..88f7f9553a --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U2U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U2U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -1.01909852e-14 +1.73609871e-03 -5.38400943e-14 +2.31479037e-03 -1.43574764e-13 +2.89347850e-03 -2.83434637e-13 +3.47216334e-03 -4.70913274e-13 +4.05084513e-03 -6.99081300e-13 +4.62952411e-03 -9.58414535e-13 +5.20820048e-03 -1.23820699e-12 +5.78687442e-03 -1.52758340e-12 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE1.son.ref new file mode 100644 index 0000000000..b9e8679628 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U3U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U3U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -1.12680635e-15 +1.73609871e-03 -5.95118739e-15 +2.31479037e-03 -1.57664429e-14 +2.89347850e-03 -3.07024816e-14 +3.47216334e-03 -4.99990985e-14 +4.05084513e-03 -7.23603589e-14 +4.62952411e-03 -9.62727240e-14 +5.20820048e-03 -1.20242788e-13 +5.78687442e-03 -1.42951394e-13 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE2.son.ref new file mode 100644 index 0000000000..74db591575 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U1U3U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U1U3U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -1.12680635e-15 +1.73609871e-03 -5.95118739e-15 +2.31479037e-03 -1.57664429e-14 +2.89347850e-03 -3.07024816e-14 +3.47216334e-03 -4.99990985e-14 +4.05084513e-03 -7.23603589e-14 +4.62952411e-03 -9.62727240e-14 +5.20820048e-03 -1.20242788e-13 +5.78687442e-03 -1.42951394e-13 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE1.son.ref new file mode 100644 index 0000000000..35a28643a9 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U2U2U2_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U2U2_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -1.85915058e-13 +1.73609871e-03 -1.00117428e-12 +2.31479037e-03 -2.73460086e-12 +2.89347850e-03 -5.55845539e-12 +3.47216334e-03 -9.55115626e-12 +4.05084513e-03 -1.47183055e-11 +4.62952411e-03 -2.10118152e-11 +5.20820048e-03 -2.83463294e-11 +5.78687442e-03 -3.66126392e-11 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE2.son.ref new file mode 100644 index 0000000000..e78245af53 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U2U2U2_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U2U2_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -1.85915058e-13 +1.73609871e-03 -1.00117428e-12 +2.31479037e-03 -2.73460086e-12 +2.89347850e-03 -5.55845539e-12 +3.47216334e-03 -9.55115626e-12 +4.05084513e-03 -1.47183055e-11 +4.62952411e-03 -2.10118152e-11 +5.20820048e-03 -2.83463294e-11 +5.78687442e-03 -3.66126392e-11 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE1.son.ref new file mode 100644 index 0000000000..03d12f4c66 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U2U2U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U2U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -2.05564295e-14 +1.73609871e-03 -1.10664194e-13 +2.31479037e-03 -3.00277659e-13 +2.89347850e-03 -6.01931216e-13 +3.47216334e-03 -1.01326075e-12 +4.05084513e-03 -1.52078166e-12 +4.62952411e-03 -2.10379672e-12 +5.20820048e-03 -2.73780195e-12 +5.78687442e-03 -3.39715437e-12 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE2.son.ref new file mode 100644 index 0000000000..73cd24e33c --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U2U2U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U2U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -2.05564295e-14 +1.73609871e-03 -1.10664194e-13 +2.31479037e-03 -3.00277659e-13 +2.89347850e-03 -6.01931216e-13 +3.47216334e-03 -1.01326075e-12 +4.05084513e-03 -1.52078166e-12 +4.62952411e-03 -2.10379672e-12 +5.20820048e-03 -2.73780195e-12 +5.78687442e-03 -3.39715437e-12 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE1.son.ref new file mode 100644 index 0000000000..d92bfff7cd --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U2U3U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U3U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -2.27290246e-15 +1.73609871e-03 -1.22322000e-14 +2.31479037e-03 -3.29730648e-14 +2.89347850e-03 -6.51925159e-14 +3.47216334e-03 -1.07545496e-13 +4.05084513e-03 -1.57320462e-13 +4.62952411e-03 -2.11143844e-13 +5.20820048e-03 -2.65560730e-13 +5.78687442e-03 -3.17442734e-13 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE2.son.ref new file mode 100644 index 0000000000..aedb5e8971 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U2U3U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U2U3U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -2.27290246e-15 +1.73609871e-03 -1.22322000e-14 +2.31479037e-03 -3.29730648e-14 +2.89347850e-03 -6.51925159e-14 +3.47216334e-03 -1.07545496e-13 +4.05084513e-03 -1.57320462e-13 +4.62952411e-03 -2.11143844e-13 +5.20820048e-03 -2.65560730e-13 +5.78687442e-03 -3.17442734e-13 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE1.son.ref new file mode 100644 index 0000000000..1d7289a9b2 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE1.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U3U3U3_METHODE1.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U3U3U3_METHODE1 [??] 
+# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -2.51312399e-16 +1.73609871e-03 -1.35207888e-15 +2.31479037e-03 -3.62078602e-15 +2.89347850e-03 -7.06166536e-15 +3.47216334e-03 -1.14201266e-14 +4.05084513e-03 -1.62936895e-14 +4.62952411e-03 -2.12424418e-14 +5.20820048e-03 -2.58712844e-14 +5.78687442e-03 -2.98775739e-14 diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE2.son.ref new file mode 100644 index 0000000000..9232b1d95e --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE2.son.ref @@ -0,0 +1,15 @@ +# Correlation_triple_VEF_U3U3U3_METHODE2.son +# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01 +# Champ U3U3U3_METHODE2 [??] +# Type POINT +0.00000000e+00 0.00000000e+00 +5.78703704e-04 0.00000000e+00 +1.15740323e-03 -2.51312399e-16 +1.73609871e-03 -1.35207888e-15 +2.31479037e-03 -3.62078602e-15 +2.89347850e-03 -7.06166536e-15 +3.47216334e-03 -1.14201266e-14 +4.05084513e-03 -1.62936895e-14 +4.62952411e-03 -2.12424418e-14 +5.20820048e-03 -2.58712844e-14 +5.78687442e-03 -2.98775739e-14 diff --git a/tests/Reference/Correlation_triple_VEF/verifie b/tests/Reference/Correlation_triple_VEF/verifie new file mode 100755 index 0000000000..f989b60130 --- /dev/null +++ b/tests/Reference/Correlation_triple_VEF/verifie @@ -0,0 +1,17 @@ +message() +{ + [ $1 != $2 ] && echo $ECHO_OPTS "Error ($1!=$2) when checking:\n $msg" && err=1 + #echo $msg +} + +##################################### +# Comparaison non regression des .son (reduction) +##################################### +err=0 +for file in `ls *.son.ref 2>/dev/null` +do + msg="compare_sonde $file ${file%.ref}" + eval $msg 1>verifie.log 2>&1 + message $? 
0 +done +exit $err diff --git a/tests/Reference/Cylindre_2D_Axi_VDF_Radiation/Cylindre_2D_Axi_VDF_Radiation.data b/tests/Reference/Cylindre_2D_Axi_VDF_Radiation/Cylindre_2D_Axi_VDF_Radiation.data index 427df828f6..b939aa3873 100644 --- a/tests/Reference/Cylindre_2D_Axi_VDF_Radiation/Cylindre_2D_Axi_VDF_Radiation.data +++ b/tests/Reference/Cylindre_2D_Axi_VDF_Radiation/Cylindre_2D_Axi_VDF_Radiation.data @@ -80,7 +80,7 @@ Lire pb Navier_Stokes_standard { - solveur_pression cholesky { } + solveur_pression petsc cholesky { } convection { amont } diffusion { } sources { boussinesq_temperature { T0 431. } } diff --git a/tests/Reference/Cylindre_tournant/Cylindre_tournant_EC_dans_repere_fixe.son_ref b/tests/Reference/Cylindre_tournant/Cylindre_tournant_EC_dans_repere_fixe.son_ref new file mode 100644 index 0000000000..cc2e762c74 --- /dev/null +++ b/tests/Reference/Cylindre_tournant/Cylindre_tournant_EC_dans_repere_fixe.son_ref @@ -0,0 +1,3 @@ +# Temps Energie_cinetique_totale +0.00000000e+00 1.17216882e-02 +4.52937624e-03 1.17216694e-02 diff --git a/tests/Reference/Cylindre_tournant/verifie b/tests/Reference/Cylindre_tournant/verifie new file mode 100755 index 0000000000..c991c9f644 --- /dev/null +++ b/tests/Reference/Cylindre_tournant/verifie @@ -0,0 +1,5 @@ +if [ -f $1_EC_dans_repere_fixe.son_ref ] +then +compare_sonde $1_EC_dans_repere_fixe.son_ref $1_EC_dans_repere_fixe.son 1>verifie.log 2>&1 || exit -1 +fi +exit 0 diff --git a/tests/Reference/DEC_64/DEC_64.data b/tests/Reference/DEC_64/DEC_64.data index 2f8b72e1d0..6bc3ac78df 100644 --- a/tests/Reference/DEC_64/DEC_64.data +++ b/tests/Reference/DEC_64/DEC_64.data @@ -23,6 +23,11 @@ Mailler_64 dom } Raffiner_simplexes_64 dom Transformer_64 dom x y z +Analyse_Angle_64 dom 10 + +Corriger_frontiere_periodique_64 { domaine dom bord paroiY } +Ecrire_med_64 dom dom.med +Lire_med_64 { domaine dom file dom.med } Partition_64 dom { Partition_tool metis_64 { nb_parts 4 } @@ -30,6 +35,10 @@ Partition_64 dom Larg_joint 2 
zones_name DOM } +# +Discretiser_domaine_64 dom +Postraiter_domaine_64 { format single_lata fichier dom domaine dom } +# # VEF domain # lire_med_64 { domaine dom file tetra.med } @@ -37,15 +46,24 @@ lire_med_64 { domaine dom file tetra.med } RegroupeBord_64 dom perioz { entree sortie } Raffiner_simplexes_64 dom +Transformer_64 dom x y z Analyse_Angle_64 dom 10 Declarer_bord_perio_64 { domaine dom bord paroiY } - +Ecrire_med_64 dom dom.med +Lire_med_64 { domaine dom file dom.med } Decouper_64 dom { partitionneur metis_64 { Nb_parts 3 } larg_joint 2 single_hdf nom_Zones dom } +Ecrire_fichier dom dom.geom +Lire_fichier dom dom.geom + +# +Discretiser_domaine_64 dom +Postraiter_domaine_64 { format single_lata fichier dom domaine dom } +# Fin diff --git a/tests/Reference/Rayonnement_VDF/Rayonnement_VDF.data b/tests/Reference/Rayonnement_VDF/Rayonnement_VDF.data index 4fee042a2d..d8dfca79e3 100644 --- a/tests/Reference/Rayonnement_VDF/Rayonnement_VDF.data +++ b/tests/Reference/Rayonnement_VDF/Rayonnement_VDF.data @@ -114,7 +114,7 @@ Lire pb2 Navier_Stokes_standard { - solveur_pression cholesky { } + solveur_pression petsc cholesky { } convection { amont } diffusion { } sources { boussinesq_temperature { T0 300. 
} } diff --git a/tests/Reference/cpu_3D/cpu_3D.TU.ref_is246827 b/tests/Reference/cpu_3D/cpu_3D.TU.ref_is246827 new file mode 100644 index 0000000000..3583e9a0d8 --- /dev/null +++ b/tests/Reference/cpu_3D/cpu_3D.TU.ref_is246827 @@ -0,0 +1,44 @@ +Statistiques d'initialisation du calcul + +Temps total 0.409547 + +Statistiques de resolution du probleme + +Temps total 0.6614 + + +Timesteps 3 +Secondes / pas de temps 0.220459 +Dont solveurs Ax=B 0.154522 70% (1 appel/pas de temps) +Dont mettre_a_jour 0.001801 0% (1 appel/pas de temps) +Dont operateurs convection 0.033740 15% (2 appels/pas de temps) +Dont operateurs diffusion 0.013478 6% (2 appels/pas de temps) +Dont operateurs gradient 0.003080 1% (2 appels/pas de temps) +Dont operateurs divergence 0.002406 1% (2 appels/pas de temps) +Dont operateurs source 0.001065 0% (1 appel/pas de temps) +Dont operations postraitement 0.001074 0% (1 appel/pas de temps) +Dont calcul dt 0.002536 1% (4 appels/pas de temps) +Dont calcul divers 0.006757 3% (0 appels/pas de temps) +Nb solveur / pas de temps 1 +Secondes / solveur 0.154522 +Iterations / solveur 103 +I/O: + +Timesteps = number of time steps +Nb solveur = number of linear system resolutions +Nb assemblage implicite = number of matrix assemblies for the implicit scheme +Iterations = average number of iterations of the solver +Communications = fraction of the time spent + in communications between processors (excluding io files) +Network latency = time of one mpsum measured by an internal bench over 0.1s +Network bandwidth = maximum on all processors + of the average bandwidth of send_recv operations +Waiting time = estimation of the waiting time of the different processors + +Max_waiting_time big => probably due to a bad partitioning +Communications > 30% => too many processors or network too slow + +Statistiques de post resolution + +Temps total 0.001712 + diff --git a/tests/Reference/rayo_transp_VDF/rayo_transp_VDF.data b/tests/Reference/rayo_transp_VDF/rayo_transp_VDF.data 
index 5c2746edc7..6766d5d38a 100644 --- a/tests/Reference/rayo_transp_VDF/rayo_transp_VDF.data +++ b/tests/Reference/rayo_transp_VDF/rayo_transp_VDF.data @@ -148,7 +148,7 @@ Lire pb_f1 Navier_Stokes_standard { - solveur_pression cholesky { } + solveur_pression petsc cholesky { } convection { amont } diffusion { } sources { boussinesq_temperature { T0 300. } } @@ -254,7 +254,7 @@ Lire pb_f2 Navier_Stokes_standard { - solveur_pression cholesky { } + solveur_pression petsc cholesky { } convection { amont } diffusion { } sources { boussinesq_temperature { T0 300. } } diff --git a/tests/Turbulence/Marche3D_Turb_Null/Marche3D_Turb_Null.data b/tests/Turbulence/Marche3D_Turb_Null/Marche3D_Turb_Null.data index 4278aea188..1e7d886863 100644 --- a/tests/Turbulence/Marche3D_Turb_Null/Marche3D_Turb_Null.data +++ b/tests/Turbulence/Marche3D_Turb_Null/Marche3D_Turb_Null.data @@ -52,7 +52,7 @@ Read pb Navier_Stokes_Turbulent { - solveur_pression cholesky { } + solveur_pression petsc cholesky { } convection { amont } diffusion { } initial_conditions { diff --git a/tests/UnitTests/unit_device.cpp b/tests/UnitTests/unit_device.cpp index a45d4635c7..64c97c4e8a 100644 --- a/tests/UnitTests/unit_device.cpp +++ b/tests/UnitTests/unit_device.cpp @@ -466,9 +466,35 @@ TEST(DeviceTest, DoubleTravCopyConstructor) -TEST(DeviceTest, copyToDevice2) +// NVCC extended lambdas cannot be used inside protected/private member functions +// (GTest's TestBody is protected). Extract kernels as free functions at file scope. 
+template +static void check_values_3d(View3D view_rw, int n0, int n1, int n2, bool& all_correct) +{ + auto policy = Kokkos::MDRangePolicy>({0, 0, 0}, {n0, n1, n2}); + Kokkos::parallel_reduce("CheckValues", policy, + KOKKOS_LAMBDA(int i, int j, int k, bool& result) { + if (view_rw(i, j, k) != i + j - k) { + result = false; + printf("Mismatch at i=%d, j=%d, k=%d: view_rw(i,j,k)=%d, expected=%d\n", + i, j, k, view_rw(i, j, k), i + j - k); + } + }, + Kokkos::LAnd(all_correct)); +} + +template +static void set_values_3d(View3D view_rw, int n0, int n1, int n2) { + auto policy = Kokkos::MDRangePolicy>({0, 0, 0}, {n0, n1, n2}); + Kokkos::parallel_for("SetValues", policy, + KOKKOS_LAMBDA(int i, int j, int k) { + view_rw(i, j, k) = i + j - k; + }); +} +TEST(DeviceTest, copyToDevice2) +{ int n0=2, n1=3, n2=4; TRUSTTab tab(n0,n1,n2); @@ -480,40 +506,22 @@ TEST(DeviceTest, copyToDevice2) } } - //This does a map to device auto view_rw = tab.view_rw<3, Kokkos::DefaultExecutionSpace>(); - - // Parallel reduce to check if all values match i + j + k - auto policy = Kokkos::MDRangePolicy>({0, 0, 0}, {n0, n1, n2}); bool all_correct = true; - Kokkos::parallel_reduce("CheckValues", policy, - KOKKOS_LAMBDA(int i, int j, int k, bool& result) { - - if (view_rw(i, j, k) != i + j - k) {result = false;printf("Mismatch at i=%d, j=%d, k=%d: view_rw(i,j,k)=%d, expected=%d\n", - i, j, k, view_rw(i, j, k), i + j - k);}}, - Kokkos::LAnd(all_correct)); + check_values_3d(view_rw, n0, n1, n2, all_correct); EXPECT_TRUE(all_correct); } TEST(DeviceTest, copyFromDevice2) { - int n0=10, n1=11, n2=12; TRUSTTab tab(n0,n1,n2); - //This does a map to device auto view_rw = tab.view_rw<3, Kokkos::DefaultExecutionSpace>(); - // Parallel reduce to check if all values match i + j + k - auto policy = Kokkos::MDRangePolicy>({0, 0, 0}, {n0, n1, n2}); - - Kokkos::parallel_for("SetValues", policy, - KOKKOS_LAMBDA(int i, int j, int k) - { - view_rw(i, j, k) = i + j - k; - }); + set_values_3d(view_rw, n0, n1, n2); 
//copyFromDevice(tab); @@ -526,6 +534,47 @@ TEST(DeviceTest, copyFromDevice2) } } +// NVCC: KOKKOS_LAMBDA cannot appear inside GTest's protected TestBody. +// Extract all kernel patterns as file-scope template helpers. +template +static void check_values_1d(View1D view, int N, bool& all_correct) +{ + Kokkos::parallel_reduce("CheckValues", N, + KOKKOS_LAMBDA(int i, bool& result) { + if (view(i) != (decltype(view(i)))i) result = false; + }, Kokkos::LAnd(all_correct)); +} + +template +static void check_values_2d_col(View2D view, int N, int col, bool& all_correct) +{ + Kokkos::parallel_reduce("CheckValues", N, + KOKKOS_LAMBDA(int i, bool& result) { + if (view(i, col) != i) result = false; + }, Kokkos::LAnd(all_correct)); +} + +template +static void check_values_2d_row(View2D view, int N, int row, bool& all_correct) +{ + Kokkos::parallel_reduce("CheckValues", N, + KOKKOS_LAMBDA(int i, bool& result) { + if (view(row, i) != i) result = false; + }, Kokkos::LAnd(all_correct)); +} + +// Checks 2D view using flat index: row = i/ncols (0 or 1), col = i%ncols +template +static void check_values_2d_flat(View2D view, int N, int ncols, bool& all_correct) +{ + Kokkos::parallel_reduce("CheckValues", N, + KOKKOS_LAMBDA(int i, bool& result) { + int k = (int)(i >= ncols); + int l = i % ncols; + if (view(k, l) != i) result = false; + }, Kokkos::LAnd(all_correct)); +} + TEST(DeviceTest, resizeGPUArrayUP){ int N=10; @@ -538,13 +587,8 @@ TEST(DeviceTest, resizeGPUArrayUP){ for (int i=0; i(all_correct)); + check_values_1d(view_rw, N, all_correct); EXPECT_TRUE(all_correct); } @@ -558,13 +602,8 @@ TEST(DeviceTest, resizeGPUArrayDOWN){ a.resize(N/2); for (int i=0; i(all_correct)); + check_values_1d(view_rw, N/2, all_correct); EXPECT_TRUE(all_correct); } @@ -583,15 +622,9 @@ TEST(DeviceTest, resizeGPUTabUP){ auto view = tab.view_rw(); bool all_correct = true; - Kokkos::parallel_reduce("CheckValues", 6, - KOKKOS_LAMBDA(int i, bool& result) { - int k=(int)(i>=3); - int l=i%3; - if (view(k,l) != 
i) {result = false; - printf("Mismatch, i=%d, view(%d,%d)=%d\n", i, k,l, view(k,l));}}, - Kokkos::LAnd(all_correct)); + check_values_2d_flat(view, 6, 3, all_correct); EXPECT_TRUE(all_correct); - + tab.resize(4,6); auto view2 = tab.view_rw(); @@ -599,15 +632,9 @@ TEST(DeviceTest, resizeGPUTabUP){ for (int i=0; i<6; i++){ EXPECT_EQ(tab(0,i),i); } - + all_correct = true; - Kokkos::parallel_reduce("CheckValues", 6, - KOKKOS_LAMBDA(int i, bool& result) { - if (view2(0,i) != i) { - result = false; - printf("Mismatch at i=%d, view2(0,i)=%d\n", i, view2(0,i)); - }}, - Kokkos::LAnd(all_correct)); + check_values_2d_row(view2, 6, 0, all_correct); EXPECT_TRUE(all_correct); } @@ -634,13 +661,7 @@ TEST(DeviceTest, resizeGPUTabDOWN){ EXPECT_EQ(tab(1,2), 5); bool all_correct = true; - Kokkos::parallel_reduce("CheckValues", 6, - KOKKOS_LAMBDA(int i, bool& result) { - int k=(int)(i>=3); - int l=i%3; - if (view2(k,l) != i) {result = false; - printf("Mismatch, i=%d, view(%d,%d)=%d\n", i, k,l, view2(k,l));}}, - Kokkos::LAnd(all_correct)); + check_values_2d_flat(view2, 6, 3, all_correct); EXPECT_TRUE(all_correct); } @@ -705,10 +726,7 @@ TEST(DeviceTest, append_line_GPU_dim1){ auto view1=tab.view_rw(); bool all_correct = true; - Kokkos::parallel_reduce("CheckValues", 5, - KOKKOS_LAMBDA(int i, bool& result) { - if (view1(i,0) != i) {result = false;}}, - Kokkos::LAnd(all_correct)); + check_values_2d_col(view1, 5, 0, all_correct); EXPECT_TRUE(all_correct); } @@ -750,17 +768,11 @@ TEST(DeviceTest, append_line_GPU_dim2){ auto view1=tab.view_rw(); bool all_correct = true; - Kokkos::parallel_reduce("CheckValues", 5, - KOKKOS_LAMBDA(int i, bool& result) { - if (view1(i,0) != i) {result = false;}}, - Kokkos::LAnd(all_correct)); + check_values_2d_col(view1, 5, 0, all_correct); EXPECT_TRUE(all_correct); all_correct = true; - Kokkos::parallel_reduce("CheckValues", 5, - KOKKOS_LAMBDA(int i, bool& result) { - if (view1(i,1) != i) {result = false;}}, - Kokkos::LAnd(all_correct)); + 
check_values_2d_col(view1, 5, 1, all_correct); EXPECT_TRUE(all_correct); } @@ -797,11 +809,7 @@ TEST(DeviceTest, append_line_GPU_dim3) { // Verify with parallel_reduce for each column for (int col = 0; col < 3; col++) { bool all_correct = true; - Kokkos::parallel_reduce("CheckValues", 5, - KOKKOS_LAMBDA(int i, bool& result) { - if (view1(i, col) != i) {result = false;} - }, - Kokkos::LAnd(all_correct)); + check_values_2d_col(view1, 5, col, all_correct); EXPECT_TRUE(all_correct); } } @@ -839,11 +847,7 @@ TEST(DeviceTest, append_line_GPU_dim4) { // Verify with parallel_reduce for each column for (int col = 0; col < 4; col++) { bool all_correct = true; - Kokkos::parallel_reduce("CheckValues", 5, - KOKKOS_LAMBDA(int i, bool& result) { - if (view1(i, col) != i) {result = false;} - }, - Kokkos::LAnd(all_correct)); + check_values_2d_col(view1, 5, col, all_correct); EXPECT_TRUE(all_correct); } } @@ -867,11 +871,7 @@ TEST(DeviceTest, copy_ctor_tab_GPU) { // Verify with parallel_reduce for each column for (int col = 0; col < 4; col++) { bool all_correct = true; - Kokkos::parallel_reduce("CheckValues", 5, - KOKKOS_LAMBDA(int i, bool& result) { - if (view2(i, col) != i) {result = false;} - }, - Kokkos::LAnd(all_correct)); + check_values_2d_col(view2, 5, col, all_correct); EXPECT_TRUE(all_correct); } }