diff --git a/DEVELOPER_NOTES b/DEVELOPER_NOTES
index 1db881a092..49eb67844c 100644
--- a/DEVELOPER_NOTES
+++ b/DEVELOPER_NOTES
@@ -1,3 +1,7 @@
+XX/XX/26 (TRUST) GPU          : Blackwell B6000 and Cuda 13.0 build (NVHPC 25.11) supported
+XX/XX/26 (TRUST) Matrix       : Introduce Stencil structure to deal with possible nnz larger than 2^31 (possible on future GPU device)
+XX/XX/26 (TRUST) VEF          : Elem_VEF_base::normale replaced by Elem_VEF_base::creer_face_normales
+XX/XX/26 (TRUST) Build        : Add a new 'profiling' target which builds a -O3 -g (+specific options) binary to ease the use of profiling tools (perf, nsys, rocprof, ...)
 -------------------------------------------------------------
 Developer notes version 1.9.8_beta : Changes since version 1.9.7 :
 -------------------------------------------------------------
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 7151a4deb7..331cb686e3 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,3 +1,10 @@
+XX/XX/26 (TRUST) GPU          : AMG solver fixed for gfx1100 AMD card and ROCm 7.2 supported
+XX/XX/26 (TRUST) Keyword      : Add Correlation_triple keyword for computed advanced fields
+XX/XX/26 (TRUST) Keyword      : Add Enstrophie_totale keyword
+XX/XX/26 (TRUST) Bug fix      : Fix Corriger_frontiere_periodique_64 issue (VDF only) + Ecrire_med_64 not available
+XX/XX/26 (TRUST) Change       : Warning, the algorithm of the VEF VerifierCoin keyword has been rewritten, which may change the numbering of the mesh and of the partitioned mesh
+XX/XX/26 (TRUST) Solver       : Sparskit based solver "solveur gen { solv_elem bicgstab|gmres ... }" removed. It can be replaced by more efficient PETSc equivalent solvers
+XX/XX/26 (TRUST) Tool         : trust -energy JOB_ID returns now on well-configured cluster the energy consumption of your job
 -------------------------------------------------------------------------------------------------
 Release notes version 1.9.8_beta : Enhancements, modifications and corrected bugs since version 1.9.7 :
 --------------------------------------------------------------------------------------------------
@@ -7,7 +14,7 @@ Release notes version 1.9.8_beta : Enhancements, modifications and corrected bug
 30/04/26 (TRUST) New feature  : Introducing an IBM pre-processor + optimisations
 27/04/26 (TRUST) New feature  : Add new colocalised discretization that currently works with a compressible single-phase Euler or two-phase Baer-Nunziato problem integrating a Riemann solver (HLL & Rusanov).
 08/04/26 (TRUST) Change       : 'corriger_frontiere_periodique' becomes 'declarer_bord_perio'. Old keyword remains valid as a synonym. A periodic boundary must now **always** be declared with this keyword. The 'periodic' option in Partitioners is no longer necessary.
-20/04/26 (TRUST) Change       : Warning, VEF VerifierCoin keyword algorithm has been rewritten which possibly change the numerotation of the mesh and the partitioned mesh 
+20/04/26 (TRUST) Change       : Warning, the algorithm of the VEF VerifierCoin keyword has been rewritten, which may change the numbering of the mesh and of the partitioned mesh
 20/04/26 (TRUST) Solver       : Sparskit based solver "solveur gen { solv_elem bicgstab|gmres ... }" removed. It can be replaced by more efficient PETSc equivalent solvers
 20/04/26 (TRUST) Tool         : trust -energy JOB_ID returns now on well-configured cluster the energy consumption of your job
 14/04/26 (TRUST) New feature  : CGNS supports now reset_time used in the framework of ICoCo
@@ -18,8 +25,8 @@ Release notes version 1.9.8_beta : Enhancements, modifications and corrected bug
 09/04/26 (TRUST) Major change : TrioCFD radiation models (transparent and semi-transparent media) are re-written completely so that they can be used in basic classes of TRUST. See examples and validations in TrioCFD code. For transparent medium, it is possible to define now more than one model in a coupled problem; ie: a model in each fluid problem. It is also possible to use it with a distant problem; ie: without a coupled problem.
 31/03/26 (TRUST) Change       : Change the diffusive time step computation in multiphase VDF. Less restrictive, similar as in PolyMAC now.
 31/03/26 (TRUST) Change       : Change the diffusive time step computation in multiphase VDF. Less restrictive, similar
-as in PolyMAC now. 
-30/03/26 (TRUST) Performance  : Computation overlapped by communication in VEF convective schemes. Convection operator duration reduced by 20% on 128 GPU case on Adastra. 
+as in PolyMAC now.
+30/03/26 (TRUST) Performance  : Computation overlapped by communication in VEF convective schemes. Convection operator duration reduced by 20% on 128 GPU case on Adastra.
 12/03/26 (TRUST) New keyword  : Analyse_angle_64 version of Analyse_angle keyword for large mesh to compute angles histogram especially for tetraedras to detect too much obtuse angles
 24/02/26 (TRUST) GPU          : Single GPU performance on MI250X (adastra, lumi) and MI300A (adastra) improved by 30% after ROCm update (6.4.x)
 24/02/26 (TRUST) Bug fix      : Replace CG by GMRES in AMG solver for better robustness and speed especially on GPU (may not converge)
@@ -51,6 +58,14 @@ as in PolyMAC now.
 13/01/26 (TRUST) New feature  : Add new flag 'adapt_dt_tmax' in time scheme that ensures that the simulation ends at tmax
 08/01/26 (TRUST) Bug fix      : Fix for the case where dt_post is specified once in the header of post-processing block and where a mix of post-processings from files and old fashion is requested
 08/01/26 (TRUST) Bug fix      : Possible memory increase when writing some files (typically opened/closed at each write)
+07/01/26 (TRUST) New feature  : New mechanics module (Meca) integrated, including linear elasticity (Hooke's law), a Newmark solver for elastodynamics, and thermo-elastic source terms.
+07/01/26 (TRUST) New feature  : Enable bidim_axi support in EF and PolyMAC_MPFA.
+07/01/26 (TRUST) New feature  : DP_impose & regular pressure drops: time-dependent driving, regul option, and save/restore; dp_regul parameter renamed from eps to alpha.
+07/01/26 (TRUST) New feature  : New source term Echange_Thermique_Volumique: implicit heat conduction between two domains in VDF/Poly*
+07/01/26 (TRUST) New feature  : New Champ_Morceaux (piecewise fields on sub-domains) and improvements to Champ_Fonc_Tabule_Morceaux (can use post-processed fields).
+07/01/26 (TRUST) Fix          : Postprocessing on deformable/ALE domains: fix weighted_sum/average (face control volumes) and optimize volume_maille.
+07/01/26 (Tools) Fix          : Improve macOS/darwin builds (PDI, darwin_g++).
+07/01/26 (TRUST) Change       : Decouper_multi now splits domains following the order specified in the input file.
 --------------------------------------------------------------------------------------------------
 Release notes version 1.9.7 : Enhancements, modifications and corrected bugs since version 1.9.6 :
 --------------------------------------------------------------------------------------------------
diff --git a/ThirdPart/src/LIBAMGX/install b/ThirdPart/src/LIBAMGX/install
index 028e669f62..c95dca46fe 100755
--- a/ThirdPart/src/LIBAMGX/install
+++ b/ThirdPart/src/LIBAMGX/install
@@ -51,7 +51,7 @@ then
    # Hack du CMakeLists.txt (vu sur orcus avec Cuda 12) car manque de cublasLt
    sed -i "1,$ s?CUDA::cublas?CUDA::cublas CUDA::cublasLt?g" ../CMakeLists.txt || exit -1
    
-   if [ "$HOST" = jean-zay ]
+   if [ "$HOST" = jean-zay ] || [ "$HOST" = dalianvl ]
    then
       # ToDo: cublas not found on JeanZay so we simplify and it works now... Generalize to other builds ?
       cmake -DCMAKE_CUDA_COMPILER=$TRUST_NVCC -DAMGX_NO_RPATH=1 -DCMAKE_INSTALL_PREFIX=$AMGX_DIR/$LIB $AMGX_COMPILERS -DCMAKE_CUDA_FLAGS_RELEASE="-DNDEBUG" .. || exit -1
diff --git a/ThirdPart/src/LIBAMGXWRAPPER/install b/ThirdPart/src/LIBAMGXWRAPPER/install
index 611d5eae22..a49d4e8ff6 100755
--- a/ThirdPart/src/LIBAMGXWRAPPER/install
+++ b/ThirdPart/src/LIBAMGXWRAPPER/install
@@ -67,6 +67,7 @@ then
    cd $AMGX_DIR/$LIB 
    rm -r -f example && mkdir -p example && cd example
    tests="poisson solveFromFiles"
+   tests="" # NOTE(review): overrides the list set just above, so the loop below builds no example test - confirm this is intentional
    for test in $tests
    do
       echo "Building $test test..."
diff --git a/ThirdPart/src/LIBCUDSS/install.sh b/ThirdPart/src/LIBCUDSS/install.sh
index c1e3652490..1939513417 100755
--- a/ThirdPart/src/LIBCUDSS/install.sh
+++ b/ThirdPart/src/LIBCUDSS/install.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
-build_dir=libcudss-linux-`uname -m`-0.6.0.5_cuda12-archive
+version=0.7.1.4
+cuda=12 && [ "`nvcc --version 2>/dev/null | grep cuda_13`" != "" ] && cuda=13
+build_dir=libcudss-linux-`uname -m`-$version"_cuda"$cuda-archive
 mkdir -p $TRUST_ROOT/lib/src/LIBCUDSS
 tar -xf $TRUST_ROOT/externalpackages/cudss/$build_dir.tar.xz || exit -1
 cp -r $build_dir/* $TRUST_ROOT/lib/src/LIBCUDSS
diff --git a/ThirdPart/src/LIBKOKKOS/install_arborx.sh b/ThirdPart/src/LIBKOKKOS/install_arborx.sh
index 97df6ccbae..73016db37b 100755
--- a/ThirdPart/src/LIBKOKKOS/install_arborx.sh
+++ b/ThirdPart/src/LIBKOKKOS/install_arborx.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 [ "$TRUST_STDCPP" = c++14 ] && exit 0
 [ "$TRUST_STDCPP" = c++17 ] && exit 0
-archive=$TRUST_ROOT/externalpackages/kokkos/arborx-2.0.1.tar.gz # C++ 20 
+archive=$TRUST_ROOT/externalpackages/kokkos/arborx-2.1.tar.gz # C++ 20 
 
 build_dir=$TRUST_ROOT/build/arborx
 KOKKOS_ROOT_DIR=$TRUST_ROOT/lib/src/LIBKOKKOS
diff --git a/ThirdPart/src/LIBKOKKOS/install_kokkos-kernels.sh b/ThirdPart/src/LIBKOKKOS/install_kokkos-kernels.sh
index 37120ef684..82f6906946 100755
--- a/ThirdPart/src/LIBKOKKOS/install_kokkos-kernels.sh
+++ b/ThirdPart/src/LIBKOKKOS/install_kokkos-kernels.sh
@@ -3,7 +3,7 @@
 [ "$TRUST_USE_GPU" != 1 ] && exit 0
 
 # Kokkos-kernels:
-archive=$TRUST_ROOT/externalpackages/kokkos/kokkos-kernels-release-candidate-5.1.0.tar.gz
+archive=$TRUST_ROOT/externalpackages/kokkos/kokkos-kernels-5.1.1.tar.gz
 build_dir=$TRUST_ROOT/build/kokkos-kernels
 KOKKOS_ROOT_DIR=$TRUST_ROOT/lib/src/LIBKOKKOS
 # Log file of the process:
diff --git a/ThirdPart/src/LIBKOKKOS/install_kokkos.sh b/ThirdPart/src/LIBKOKKOS/install_kokkos.sh
index 696963c50d..b01d9908da 100755
--- a/ThirdPart/src/LIBKOKKOS/install_kokkos.sh
+++ b/ThirdPart/src/LIBKOKKOS/install_kokkos.sh
@@ -3,7 +3,7 @@
 if [ "$TRUST_STDCPP" = c++20 ]
 then
    # Kokkos (C++20):
-   archive=$TRUST_ROOT/externalpackages/kokkos/kokkos-release-candidate-5.1.0.tar.gz
+   archive=$TRUST_ROOT/externalpackages/kokkos/kokkos-5.1.1.tar.gz
 elif [ "$TRUST_STDCPP" = c++17 ]
 then
    # Kokkos (C++17):
diff --git a/ThirdPart/src/LIBKOKKOS/makefile b/ThirdPart/src/LIBKOKKOS/makefile
index 28d6cfebc7..aca90cc518 100644
--- a/ThirdPart/src/LIBKOKKOS/makefile
+++ b/ThirdPart/src/LIBKOKKOS/makefile
@@ -1,8 +1,9 @@
 # Kokkos
-lib=$(TRUST_LIB)/src/LIBKOKKOS/$(TRUST_ARCH)$(OPT)/lib64/libkokkoscore.a
+lib1=$(TRUST_LIB)/src/LIBKOKKOS/$(TRUST_ARCH)$(OPT)/lib64/libkokkoscore.a
+lib2=$(TRUST_LIB)/src/LIBKOKKOS/$(TRUST_ARCH)$(OPT)/lib64/libkokkoskernels.a
 
-all: $(lib) 
-$(lib): install_kokkos.sh install_kokkos-kernels.sh install_arborx.sh makefile
+all: $(lib1) $(lib2)
+$(lib1) $(lib2): install_kokkos.sh install_kokkos-kernels.sh install_arborx.sh makefile
 	@make clean
 	./install_kokkos.sh && ./install_kokkos-kernels.sh && ./install_arborx.sh
 
diff --git a/ThirdPart/src/LIBLAPACK/Installer b/ThirdPart/src/LIBLAPACK/Installer
index e1bd5482fb..17a8b8a411 100755
--- a/ThirdPart/src/LIBLAPACK/Installer
+++ b/ThirdPart/src/LIBLAPACK/Installer
@@ -11,7 +11,7 @@ install_OpenBlas()
    rm -r -f *-OpenBLAS-*
    OPENBLAS_USE_OPENMP=$TRUST_USE_OPENMP
    # Disable OpenMP for PETSc (Probleme avec STRUMPACK sur GPU)
-   OPENBLAS_USE_OPENMP=0 
+   OPENBLAS_USE_OPENMP=0
    gunzip -f -c $package | tar -xf -
    cd OpenBLAS-$version_openblas
    if [ "$TRUST_CC_BASE_EXTP" != "" ]
@@ -39,7 +39,7 @@ install_OpenBlas()
    #   CPU_ARCH=""
    #fi
    # Instructions -mavx512 fait crasher valgrind sur les machines avec instructions avx512 ... On desactive (comme dans TRUST d'ailleurs)
-   options="CC=$CC FC=$FC USE_THREAD=$OPENBLAS_USE_OPENMP USE_OPENMP=$OPENBLAS_USE_OPENMP NO_AVX512=1 $CPU_ARCH"
+   options="CC=$CC FC=$FC USE_THREAD=$OPENBLAS_USE_OPENMP USE_OPENMP=$OPENBLAS_USE_OPENMP NO_AVX512=1 BUILD_BFLOAT16=0 $CPU_ARCH"
    echo "Installation of $package ($options)..."
    if [ `uname -s` = Darwin ]
    then
@@ -144,8 +144,7 @@ version_lapack=3.4.1 && [ "$TRUST_INT64" != "1" ] && [ "$TRUST_USE_MUMPS" != 1 ]
 # [HPC][!Portabilite] Utilisation OpenBlas par defaut (valide sur TRUST/F5/TrioCFD/G3) -> Decomposition LU plus rapide
 if [ "$TRUST_USE_OPENBLAS" = 1 ]
 then
-   #for tag in 0.3.25 0.3.29
-   for tag in 0.3.29
+   for tag in 0.3.33
    do 
      version_openblas=$tag
      echo "version_openblas = $tag"
diff --git a/ThirdPart/src/LIBPETSC/amgx_int32.cxx b/ThirdPart/src/LIBPETSC/amgx_int32.cxx
index 0cde5cd0ba..305bef9ac8 100644
--- a/ThirdPart/src/LIBPETSC/amgx_int32.cxx
+++ b/ThirdPart/src/LIBPETSC/amgx_int32.cxx
@@ -552,30 +552,30 @@ static PetscErrorCode PCView_AMGX(PC pc, PetscViewer viewer)
 }
 
 /*MC
-     PCAMGX - Interface to NVIDIA's AmgX algebraic multigrid
-
-   Options Database Keys:
-+    -pc_amgx_amg_method <CLASSICAL,AGGREGATION> - set the AMG algorithm to use
-.    -pc_amgx_amg_cycle <V,W,F,CG> - set the AMG cycle type
-.    -pc_amgx_smoother <PCG,PCGF,PBICGSTAB,GMRES,FGMRES,JACOBI_L1,BLOCK_JACOBI,GS,MULTICOLOR_GS,MULTICOLOR_ILU,MULTICOLOR_DILU,CHEBYSHEV_POLY,NOSOLVER> - set the AMG pre/post smoother
-.    -pc_amgx_jacobi_relaxation_factor - set the relaxation factor for Jacobi smoothing
-.    -pc_amgx_gs_symmetric - enforce symmetric Gauss-Seidel smoothing (only applies if GS smoothing is selected)
-.    -pc_amgx_selector <SIZE_2,SIZE_4,SIZE_8,MULTI_PAIRWISE,PMIS,HMIS> - set the AMG coarse selector
-.    -pc_amgx_presweeps - set the number of AMG pre-sweeps
-.    -pc_amgx_postsweeps - set the number of AMG post-sweeps
-.    -pc_amgx_max_levels - set the maximum number of levels in the AMG level hierarchy
-.    -pc_amgx_strength_threshold - set the strength threshold for the AMG coarsening
-.    -pc_amgx_aggressive_levels - set the number of levels (from the finest) that should apply aggressive coarsening
-.    -pc_amgx_coarse_solver <DENSE_LU_SOLVER,NOSOLVER> - set the coarse solve
-.    -pc_amgx_print_grid_stats - output the AMG grid hierarchy to stdout
--    -pc_amgx_verbose - enable AmgX output
+  PCAMGX - Interface to NVIDIA's AmgX algebraic multigrid
+
+  Options Database Keys:
++    -pc_amgx_amg_method (CLASSICAL,AGGREGATION)                       - set the AMG algorithm to use
+.    -pc_amgx_amg_cycle (V,W,F,CG)                                     - set the AMG cycle type
+.    -pc_amgx_jacobi_relaxation_factor                                 - set the relaxation factor for Jacobi smoothing
+.    -pc_amgx_gs_symmetric                                             - enforce symmetric Gauss-Seidel smoothing (only applies if GS smoothing is selected)
+.    -pc_amgx_selector (SIZE_2|SIZE_4|SIZE_8|MULTI_PAIRWISE|PMIS|HMIS) - set the AMG coarse selector
+.    -pc_amgx_presweeps                                                - set the number of AMG pre-sweeps
+.    -pc_amgx_postsweeps                                               - set the number of AMG post-sweeps
+.    -pc_amgx_max_levels                                               - set the maximum number of levels in the AMG level hierarchy
+.    -pc_amgx_strength_threshold                                       - set the strength threshold for the AMG coarsening
+.    -pc_amgx_aggressive_levels                                        - set the number of levels (from the finest) that should apply aggressive coarsening
+.    -pc_amgx_coarse_solver (DENSE_LU_SOLVER,NOSOLVER)                 - set the coarse solve
+.    -pc_amgx_print_grid_stats                                         - output the AMG grid hierarchy to `stdout`
+.    -pc_amgx_verbose                                                  - enable AmgX verbose output
+-    -pc_amgx_smoother (PCG|PCGF|PBICGSTAB|GMRES|FGMRES|JACOBI_L1|BLOCK_JACOBI|GS|MULTICOLOR_GS|MULTICOLOR_ILU|MULTICOLOR_DILU|CHEBYSHEV_POLY|NOSOLVER) - set the AMG pre/post smoother
 
    Level: intermediate
 
    Note:
-     Implementation will accept host or device pointers, but good performance will require that the `KSP` is also GPU accelerated so that data is not frequently transferred between host and device.
+   Implementation will accept host or device pointers, but good performance will require that the `KSP` is also GPU accelerated so that data is not frequently transferred between host and device.
 
-.seealso: [](ch_ksp), `PCGAMG`, `PCHYPRE`, `PCMG`, `PCAmgXGetResources()`, `PCCreate()`, `PCSetType()`, `PCType` (for list of available types), `PC`
+.seealso: [](ch_ksp), `PCGAMG`, `PCHYPRE`, `PCMG`, `PCAmgXGetResources()`, `PCCreate()`, `PCSetType()`, `PCType`, `PC`
 M*/
 
 PETSC_EXTERN PetscErrorCode PCCreate_AMGX(PC pc)
diff --git a/ThirdPart/src/LIBPETSC/amgx_int64.cxx b/ThirdPart/src/LIBPETSC/amgx_int64.cxx
index 5587942db3..7b629ce4e1 100644
--- a/ThirdPart/src/LIBPETSC/amgx_int64.cxx
+++ b/ThirdPart/src/LIBPETSC/amgx_int64.cxx
@@ -556,30 +556,30 @@ static PetscErrorCode PCView_AMGX(PC pc, PetscViewer viewer)
 }
 
 /*MC
-     PCAMGX - Interface to NVIDIA's AmgX algebraic multigrid
-
-   Options Database Keys:
-+    -pc_amgx_amg_method <CLASSICAL,AGGREGATION> - set the AMG algorithm to use
-.    -pc_amgx_amg_cycle <V,W,F,CG> - set the AMG cycle type
-.    -pc_amgx_smoother <PCG,PCGF,PBICGSTAB,GMRES,FGMRES,JACOBI_L1,BLOCK_JACOBI,GS,MULTICOLOR_GS,MULTICOLOR_ILU,MULTICOLOR_DILU,CHEBYSHEV_POLY,NOSOLVER> - set the AMG pre/post smoother
-.    -pc_amgx_jacobi_relaxation_factor - set the relaxation factor for Jacobi smoothing
-.    -pc_amgx_gs_symmetric - enforce symmetric Gauss-Seidel smoothing (only applies if GS smoothing is selected)
-.    -pc_amgx_selector <SIZE_2,SIZE_4,SIZE_8,MULTI_PAIRWISE,PMIS,HMIS> - set the AMG coarse selector
-.    -pc_amgx_presweeps - set the number of AMG pre-sweeps
-.    -pc_amgx_postsweeps - set the number of AMG post-sweeps
-.    -pc_amgx_max_levels - set the maximum number of levels in the AMG level hierarchy
-.    -pc_amgx_strength_threshold - set the strength threshold for the AMG coarsening
-.    -pc_amgx_aggressive_levels - set the number of levels (from the finest) that should apply aggressive coarsening
-.    -pc_amgx_coarse_solver <DENSE_LU_SOLVER,NOSOLVER> - set the coarse solve
-.    -pc_amgx_print_grid_stats - output the AMG grid hierarchy to stdout
--    -pc_amgx_verbose - enable AmgX output
+  PCAMGX - Interface to NVIDIA's AmgX algebraic multigrid
+
+  Options Database Keys:
++    -pc_amgx_amg_method (CLASSICAL,AGGREGATION)                       - set the AMG algorithm to use
+.    -pc_amgx_amg_cycle (V,W,F,CG)                                     - set the AMG cycle type
+.    -pc_amgx_jacobi_relaxation_factor                                 - set the relaxation factor for Jacobi smoothing
+.    -pc_amgx_gs_symmetric                                             - enforce symmetric Gauss-Seidel smoothing (only applies if GS smoothing is selected)
+.    -pc_amgx_selector (SIZE_2|SIZE_4|SIZE_8|MULTI_PAIRWISE|PMIS|HMIS) - set the AMG coarse selector
+.    -pc_amgx_presweeps                                                - set the number of AMG pre-sweeps
+.    -pc_amgx_postsweeps                                               - set the number of AMG post-sweeps
+.    -pc_amgx_max_levels                                               - set the maximum number of levels in the AMG level hierarchy
+.    -pc_amgx_strength_threshold                                       - set the strength threshold for the AMG coarsening
+.    -pc_amgx_aggressive_levels                                        - set the number of levels (from the finest) that should apply aggressive coarsening
+.    -pc_amgx_coarse_solver (DENSE_LU_SOLVER,NOSOLVER)                 - set the coarse solve
+.    -pc_amgx_print_grid_stats                                         - output the AMG grid hierarchy to `stdout`
+.    -pc_amgx_verbose                                                  - enable AmgX verbose output
+-    -pc_amgx_smoother (PCG|PCGF|PBICGSTAB|GMRES|FGMRES|JACOBI_L1|BLOCK_JACOBI|GS|MULTICOLOR_GS|MULTICOLOR_ILU|MULTICOLOR_DILU|CHEBYSHEV_POLY|NOSOLVER) - set the AMG pre/post smoother
 
    Level: intermediate
 
    Note:
-     Implementation will accept host or device pointers, but good performance will require that the `KSP` is also GPU accelerated so that data is not frequently transferred between host and device.
+   Implementation will accept host or device pointers, but good performance will require that the `KSP` is also GPU accelerated so that data is not frequently transferred between host and device.
 
-.seealso: [](ch_ksp), `PCGAMG`, `PCHYPRE`, `PCMG`, `PCAmgXGetResources()`, `PCCreate()`, `PCSetType()`, `PCType` (for list of available types), `PC`
+.seealso: [](ch_ksp), `PCGAMG`, `PCHYPRE`, `PCMG`, `PCAmgXGetResources()`, `PCCreate()`, `PCSetType()`, `PCType`, `PC`
 M*/
 
 PETSC_EXTERN PetscErrorCode PCCreate_AMGX(PC pc)
diff --git a/ThirdPart/src/LIBPETSC/install b/ThirdPart/src/LIBPETSC/install
index 345bec7c98..9d542d2d60 100755
--- a/ThirdPart/src/LIBPETSC/install
+++ b/ThirdPart/src/LIBPETSC/install
@@ -10,7 +10,7 @@ then
    cd -
 else
    # package=petsc-3.24.2.tar.gz # Huge issue with 3.24.x : make ctest_optim hangs
-   package=petsc-3.23.7.tar.gz && [ "$TRUST_USE_GPU" = 1 ] && package=petsc-5d4b16a5b.tar.gz # 2026_01_06
+   package=petsc-3.23.7.tar.gz && [ "$TRUST_USE_GPU" = 1 ] && package=petsc-99a952e4.tar.gz # 2026_05_31
 fi
 
 ######################################
@@ -290,10 +290,9 @@ with_gpu=""
 # Kokkos-Kernels #
 ##################
 # Toujours un pb de link en debug avec kokkos-kernels
-# Et les performances de kokkos-kernels est moindre (x4 slowdown on A6000) par rapport aux kernels PETSc CUDA
+# Et les performances de kokkos-kernels encore en retrait par rapport aux kernels PETSc CUDA
 # Mais utilise avec GAMG de PETSc cela permet d'avoir une alternative a AmgX (int64?) et Hypre
-# Build error on HIP also and orcus: on active ponctuellement
-ENABLE_KOKKOS=0 # && [ "$HOST" = topaze ] && ENABLE_KOKKOS=1
+ENABLE_KOKKOS=1
 if [ "$ENABLE_KOKKOS" = 1 ] && [ -f $TRUST_ROOT/lib/src/LIBKOKKOS/$TRUST_ARCH"_opt"/lib64/libkokkoskernels.a ]
 then
    with_gpu=$with_gpu" --with-kokkos=1 --with-kokkos-dir=$TRUST_ROOT/lib/src/LIBKOKKOS/$TRUST_ARCH"_opt
@@ -316,8 +315,11 @@ then
       ln -s -f $CUDA_ROOT/lib64/stubs/libcuda.so libcuda.so.1
       export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
    fi
-   # Hack for this sh.t :
-   [ ! -f /usr/lib64/libnvidia-ml.so.1 ] && cp -f $TRUST_ROOT/ThirdPart/src/LIBPETSC/libnvidia-ml.so.1 $TRUST_LIB/libnvidia-ml.so.1
+   # Hack for this sh.t : PL ou cela pose probleme ? Sur orcus, pas sur la frontale donc stubs copie et plantage sur noeud de calcul...
+   #for file in /usr/lib64/libnvidia-ml.so.1 /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 $TRUST_ROOT/ThirdPart/src/LIBPETSC/libnvidia-ml.so.1
+   #do
+   #   [ -f $file ] && cp -f $file $TRUST_LIB && break
+   #done
    echo "CUDA support for PETSc with: "$with_gpu
 elif [ "$TRUST_USE_ROCM" = 1 ]
 then
@@ -590,27 +592,38 @@ then
    #######
    # Hypre
    #######
-   add_package hypre 2.33.0
-   # add_package hypre
+   # Le support de ROCm 7.x commence avec 3.1.0. Cuda 13 necessite aussi 3.1.0
+   # hypre=2.33.0 && [ "`nvcc --version 2>&1 | grep cuda_13`" != "" ] && hypre=3.1.0
+   # On passe a 3.1.0 car valide sur JeanZay H100 et Adastra MI300
+   # On passe a master (89e7e8d) pour corriger un pb de build sur ROCm 7.2.x
+   hypre=master
+   add_package hypre $hypre
    if [ "$TRUST_USE_GPU" = 1 ]
    then
-       # Ne surtout pas activer UVM: lent sur AMD et CUDA :
+      # Ne surtout pas activer UVM: lent sur AMD et CUDA :
       #hypre_configure="--enable-unified-memory"
-
-      # Enable MPI GPU awareness for Hypre (plantage encore sur adastra MI250 pendant KSPSolve: Memory access fault by GPU)
-      #[ "$TRUST_MPI_GPU_AWARE" = 1 ] && [ "$ROCM_ARCH" != gfx90a ] && hypre_configure=$hypre_configure" --enable-gpu-aware-mpi"
-      # PL: I disable --enable-gpu-aware-mpi cause issue on adastra (perf or crash)
-      sed -i "1,$ s?--enable-gpu-aware-mpi??g" config/BuildSystem/config/packages/hypre.py || exit -1
-
+      # Seems OK on Lumi (40% faster on 4xMI250X on OpenMP_Iterateur)
+      # 20% faster on 2xA100 (orcus) TaylorGreen_BENCH
+      # 50% faster on 8xMI300A (adastra) TaylorGreen_BENCH
+      # No gain on Topaze on 4xA100 CALCUL_83M ?
+      #if [ $TRUST_USE_ROCM = 1 ]
+      #then
+      #   echo "WARNING: Disabling --enable-gpu-aware-mpi in Hypre on ROCm cause issues on adastra (perf or crash) ?"
+      #   sed -i "1,$ s?--enable-gpu-aware-mpi??g" config/BuildSystem/config/packages/hypre.py || exit -1
+      #fi
       # Hack pour Hypre car --download-hypre-configure-arguments difficile a faire marcher avec plusieurs arguments...
       sed -i "1,$ s?--with-MPI-libs?$hypre_configure --with-MPI-libs?g" config/BuildSystem/config/packages/hypre.py || exit -1
-
+      
       #nedit config/BuildSystem/config/packages/hypre.py
 
       [ "$TRUST_USE_CUDA" = 1 ] && GPU_ARCH=$TRUST_CUDA_CC
       [ "$TRUST_USE_ROCM" = 1 ] && GPU_ARCH=$ROCM_ARCH
       with_packages=$with_packages" --with-hypre-gpu-arch=$GPU_ARCH"
-      #add_package Umpire # Recommended for performance See later some issue when linking with Hypre
+      # Recommended for performance during Hypre setup according PETSc:
+      # Setup on 8xMI300A 15s->13s but +30% RAM !
+      # Mandatory for HIP with --enable-gpu-aware-mpi cause OpenMP_Iterateur/weak_scaling.sh crash on device on 256xMI250X
+      # Disabled for CUDA cause hangs on orcus a100
+      [ "$TRUST_USE_ROCM" = 1 ] && add_package Umpire && LIBS="--LIBS=-lrt" # Fix on old Fedora: Undefined shm_open
    fi
 fi
 
@@ -701,7 +714,7 @@ do
    echo "Configuring PETSc..."
    # Hack provisoire sur aarch64, python de conda fait planter le configure:
    [ "`uname -m`" = aarch64 ] && PATH=/usr/bin:$PATH
-   LIBS="" && [ `uname -s` = Darwin ] && LIBS=--LIBS=`$TRUST_Awk '/SYSLIBS =/ {gsub("SYSLIBS =","",$0);print $0}' $fic_env`
+   [ `uname -s` = Darwin ] && LIBS=--LIBS=`$TRUST_Awk '/SYSLIBS =/ {gsub("SYSLIBS =","",$0);print $0}' $fic_env`
    export TMPDIR=$TRUST_TMP # Par defaut, les fichiers temporaires de PETSC sont sous /tmp, cela peut probleme si pas de droit d'execution donnes
    ./configure --help 1>../configure.help 2>&1
    cp ../configure.help $PETSC_ROOT/
diff --git a/Validation/Rapports_automatiques/Verification/Verification_codage/Champ_front_MED/src/pre_run b/Validation/Rapports_automatiques/Verification/Verification_codage/Champ_front_MED/src/pre_run
index 1a37b51bfc..d017c52b45 100644
--- a/Validation/Rapports_automatiques/Verification/Verification_codage/Champ_front_MED/src/pre_run
+++ b/Validation/Rapports_automatiques/Verification/Verification_codage/Champ_front_MED/src/pre_run
@@ -8,5 +8,6 @@ source $TRUST_MEDCOUPLING_ROOT/env.sh
 python ./prepare.py || exit -1
 
 # Dump du field.med cree:
+export LD_LIBRARY_PATH=$TRUST_ROOT/lib/src/LIBHDF5/lib:$LD_LIBRARY_PATH
 echo -e "1\n1\n0\n" | $TRUST_ROOT/lib/src/LIBMED/bin/mdump --structure fields.med
 
diff --git a/bin/KSH/Createcmakefile.py b/bin/KSH/Createcmakefile.py
index a4fe09d131..34f800114d 100644
--- a/bin/KSH/Createcmakefile.py
+++ b/bin/KSH/Createcmakefile.py
@@ -359,6 +359,9 @@ def generate_cmake_files(root_dir, atelier):
 
   if (lib STREQUAL "nvidia-ml")
     set (lib${lib} /usr/lib64/libnvidia-ml.so.1) # PC, some clusters
+    if(NOT EXISTS ${lib${lib}})
+       set (lib${lib} /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1) # PC Ubuntu
+    endif()   
     if(NOT EXISTS ${lib${lib}}) # Other clusters, cmake can't find this sh.t
        set (lib${lib} ${TRUST_ROOT}/lib/libnvidia-ml.so.1)
     endif()   
diff --git a/bin/KSH/debog.sh b/bin/KSH/debog.sh
index c0d82260d9..7c3df673db 100755
--- a/bin/KSH/debog.sh
+++ b/bin/KSH/debog.sh
@@ -4,8 +4,11 @@ jdd=`pwd`
 jdd=`basename $jdd`
 cp $jdd.data cpu.data
 cp $jdd.data gpu.data
-sed -i "1,$ s?Solve?Debog pb seq faces 1.e-6 0 Solve?g" cpu.data
-sed -i "1,$ s?Solve?Debog pb seq faces 1.e-6 1 Solve?g" gpu.data
-(source $cpu/env_TRUST.sh;$exec  cpu 2>&1 | tee cpu.out_err)
-$exec gpu 2>&1 | tee gpu.out_err
-compare_lata cpu.lml gpu.lml
+pb=`awk '/Solve/ && (NF==2) {print $2}' $jdd.data`
+seuil=1.e-12 # seuil relatif mais sert aussi comme seuil absolu pour filtrer
+sed -i "1,$ s?Solve?Debog $pb seq faces $seuil 1 Solve?g" cpu.data || exit -1
+sed -i "1,$ s?Solve?Debog $pb seq faces $seuil 0 Solve?g" gpu.data || exit -1
+rm -f *lml
+TRUST_CLOCK_ON=1 $exec gpu 2>&1 | tee gpu.out_err
+(env=`ls $cpu/env_*sh`;source $env;$exec  cpu 2>&1 | tee cpu.out_err)
+compare_lata cpu.lml gpu.lml --max_delta
diff --git a/bin/gestion_externalpackages/md5.ref b/bin/gestion_externalpackages/md5.ref
index fa522032be..89865c560a 100644
--- a/bin/gestion_externalpackages/md5.ref
+++ b/bin/gestion_externalpackages/md5.ref
@@ -259,21 +259,23 @@ b8042f9970ea70a36da1ee1fae27c448  VisIt/mesa-17.3.9.tar.xz
 85adef240c5f370b308da8c938951a68  VisIt/zlib-1.2.11.tar.xz
 ba84eaa8564155babd4ba1458d4eaa11  astyle_2.03_linux.tar.gz
 d43a8fbe83767978098ba7f8ee25d3d1  ccache/ccache-3.1.4.tar.gz
+134a43d5c8e01c7805b1adc2dc7e7048  ccache/ccache-4.13.6-linux-aarch64-musl-static.tar.gz
 27fc515a919221d69008cf7347137752  ccache/ccache-4.8.2-darwin.tar.gz
 34991901e77027afcc3bb16a9595c353  ccache/ccache-4.8.2-linux-x86_64.tar.gz
-34fd4b0843da02ebaa76f5711e1b63de  cudss/libcudss-linux-aarch64-0.6.0.5_cuda12-archive.tar.xz
-4ac17f5b35a4ecc550c4d7c479a5c5b5  cudss/libcudss-linux-x86_64-0.6.0.5_cuda12-archive.tar.xz
+e81b58209814379f5d1476705229602f  cudss/libcudss-linux-aarch64-0.7.1.4_cuda13-archive.tar.xz
+97a40c68c2f4d4d0405532c65ede87b7  cudss/libcudss-linux-x86_64-0.7.1.4_cuda12-archive.tar.xz
+38cfe9a97f3d8e8060d99ba34bdd8d3b  cudss/libcudss-linux-x86_64-0.7.1.4_cuda13-archive.tar.xz
 2cf02a542c1933de95bdbe3f42188ffa  doxygen-1.7.4.linux.bin.tar.gz
 065cef54eb09cdb54614e1ed353ddbd1  doxygen-1.9.3.src.tar.gz
 27c5022f697e2522c0dbab439b9573b9  gnuplot/gnuplot-5.2.7.tar.gz
 ea0931758fc180e3b1950931b9869921  gnuplot/gnuplot-6.0.2.tar.gz
 4fa24da17c99b122a56cb8808b6eb78b  hwloc-2.7.1.tar.gz
-a51b0a245c34151f42a4c6120d2aceec  kokkos/arborx-2.0.1.tar.gz
+58e992e49dcb3a100e6b7dbd75a6689b  kokkos/arborx-2.1.tar.gz
 36abe803480d07db87f9ba03cd5a842c  kokkos/kokkos-3.7.02.tgz
 24cd603e2a047fc8d67d814f33769f54  kokkos/kokkos-4.7.00.tar.gz
-7ab1e3728978c5be85d77485eea96aa1  kokkos/kokkos-kernels-release-candidate-5.1.0.tar.gz
-6a1c520d8aa7147fb4067ee7745e2575  kokkos/kokkos-release-candidate-5.1.0.tar.gz
-853a0c5c0747c5943e7ef4bbb793162d  lapack/OpenBLAS-0.3.29.tar.gz
+1d68ff32eaea69cb97726fb6b6354b7c  kokkos/kokkos-5.1.1.tar.gz
+3a7125c6e47f19dc3d75712b24293aaa  kokkos/kokkos-kernels-5.1.1.tar.gz
+96c5cd9013013faefc294bc57830c77d  lapack/OpenBLAS-0.3.33.tar.gz
 44c3869c38c8335c2b9c2a8bb276eb55  lapack/lapack-3.4.1.tgz
 b5e558f981326d9ca1bfdb841640721a  make-4.0.tar.gz
 9be6e048224797bf531f94b7a8aaa99d  osqp-0.6.0.tar.gz
@@ -286,12 +288,13 @@ d4c0862c48e6e9742807e6e50bdf5deb  petsc/STRUMPACK_8.0.0.tar.gz
 8048f7b7b50daa99257593cf2e7d785a  petsc/ScaLAPACK-6f56981cb0cabffd8c72c7d1016146c4b8e276dc.tar.gz
 e659373ed5e9b961d2fcb6d67d250783  petsc/SuiteSparse-7.7.0.tar.gz
 f8559a94ee64c8b70ebd79b65576d08d  petsc/SuperLU_DIST-9.1.0.tar.gz
-da990c4e944ede86879fd29ee309d8c4  petsc/Umpire-2025.09.0.tar.gz
+a684171841395b8903963e3970aae567  petsc/Umpire-2025.12.0.tar.gz
 d4990384b7b1d8b0357fc34d91530d49  petsc/hypre-2.33.0.tar.gz
 c33d67d1ae475460002782b09929e5cf  petsc/hypre-3.1.0.tar.gz
+870443b8d6d173469e91af547e6bcb5a  petsc/hypre-master.tar.gz
 88a40e3bf9e8ee28af8725a73f9e3bc3  petsc/petsc-3.23.7.tar.gz
 d4c79d4859cd6770439e7a4d880777de  petsc/petsc-3.24.2.tar.gz
-6a53e417a59d7cbde483babb624703f7  petsc/petsc-5d4b16a5b.tar.gz
+bb46da11f4af1c212cd0bdcaef48fa91  petsc/petsc-99a952e4.tar.gz
 f121c9d7ef5e43a20899acd93f425b22  petsc/slate-v2023.06.00.tgz
 5390282424f874836d572e2ae4e3c185  petsc/zfp-1.0.1.tar.gz
 e97ed4ddf3b59a05729097ab66a46b03  pip/mpi4py-4.0.0.tar.gz
diff --git a/bin/lance_test b/bin/lance_test
index b559b50010..aba68fa885 100755
--- a/bin/lance_test
+++ b/bin/lance_test
@@ -1011,7 +1011,7 @@ then
              liste_skipped=$liste_skipped" $i"
              i=""
           # Discard Pb_multiphase #
-          elif [ "`grep -i "Pb_multiphase " $file 2>/dev/null`" != "" ]
+          elif [ "`grep -i "Pb_multiphase" $file 2>/dev/null`" != "" ]
           then
              echo "Test $i skipped (Pb_multiphase not yet supported on GPU)"
              liste_skipped=$liste_skipped" $i"
diff --git a/bin/mklibs b/bin/mklibs
index 27d802beca..8671767134 100755
--- a/bin/mklibs
+++ b/bin/mklibs
@@ -50,7 +50,7 @@ then
       [ -f libcmumps.a ] && PETSC_L=$PETSC_L" cmumps dmumps mumps_common smumps zmumps pord"
       [ -f libpastix.a ] && PETSC_L=$PETSC_L" pastix"
       [ -f libml.a ] && PETSC_L=$PETSC_L" ml"
-      for lib in HYPRE strumpack magma sbutterflypack dbutterflypack cbutterflypack zbutterflypack zfp slate blaspp lapackpp scalapack blacs spai parms parmetis metis ptesmumps ptscotcherr ptscotcherrexit ptscotchparmetisv3 ptscotch scotch scotcherr scotcherrexit
+      for lib in HYPRE umpire camp strumpack magma sbutterflypack dbutterflypack cbutterflypack zbutterflypack zfp slate blaspp lapackpp scalapack blacs spai parms parmetis metis ptesmumps ptscotcherr ptscotcherrexit ptscotchparmetisv3 ptscotch scotch scotcherr scotcherrexit
       do
          [ -f lib$lib.a ] && PETSC_L=$PETSC_L" $lib"
       done
diff --git a/bin/trust b/bin/trust
index 3fae8615f4..aac19d669a 100755
--- a/bin/trust
+++ b/bin/trust
@@ -144,6 +144,7 @@ help()
    echo "-perf                         : Run perf tool (profiling)."
    echo "-trace                        : Run traceanalyzer tool (MPI profiling)."
    [ "`rocprof --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-rocprof                      : Run rocprof tool (GPU profiling on AMD)"
+   [ "`rocprof --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-rcu                       : Run rocprof compute (GPU profiling on AMD)"
    [ "`nsys --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-nsys                         : Run Nsight system tool (GPU profiling)."
    [ "`ncu --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-ncu kernel                    : Run Nsight compute tool on given kernel (GPU Kernel profiling)."
    [ "`compute-sanitizer --version 1>/dev/null 2>&1;echo $?`" = 0 ] && echo "-cs memcheck|racecheck|initcheck|synccheck : Run compute sanitizer tool (GPU debugging)."
@@ -455,6 +456,9 @@ do
    elif [ "$1" = "-rocprof" ]
    then
       ROCPROF=1
+   elif [ "$1" = "-rcu" ]
+   then
+      RCU=1
    elif [ "$1" = "-nsys" ]
    then
       NSYS=1
@@ -880,9 +884,17 @@ then
    rm -f *.pftrace *.csv
    #See https://dci.dci-gitlab.cines.fr/webextranet/software_stack/tools/index.html#adastra-software-stack-tools-profiling-rocprof
    #export AMD_SERIALIZE_COPY=3 AMD_SERIALIZE_KERNEL=3 GPU_MAX_HW_QUEUES=1
-   #profiler="rocprofv3 -o $NOM --output-format=pftrace --hip-trace --"
-   #exec="\"rocprofv3 -o $NOM --output-format=pftrace --hip-trace --hip-runtime-trace -- $exec\""
-   exec="\"rocprofv3 -o $NOM --output-format=pftrace --memory-allocation-trace --hip-trace --kernel-trace --kokkos-trace -- $exec\""
+   # --runtime-trace : Collects tracing data for HIP runtime API, marker (ROCTx) API, RCCL API, memory operations (copies, scratch, and allocation), and kernel dispatches.
+   # --kokkos-trace: Enables builtin Kokkos tools support, which implies enabling --marker-trace collection and --kernel-rename.
+   # --scratch-memory-trace: detect scratch mem alloc
+   # --stats
+   exec="\"rocprofv3 -o $NOM --output-format=pftrace --memory-copy-trace --memory-allocation-trace --hip-trace --kernel-trace --kokkos-trace -- $exec\""
+elif [ "$RCU" = 1 ]
+then
+   # Trace roofline
+   rm -f *.pftrace *.csv
+   echo "pmc: TCC_EA_RDREQ_32B_sum TCC_EA_RDREQ_sum TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum SQ_INSTS_VALU_ADD_F64 SQ_INSTS_VALU_MUL_F64 SQ_INSTS_VALU_FMA_F64 SQ_INSTS_VALU_TRANS_F64 SQ_INSTS_VALU_MFMA_MOPS_F64" > counters.txt
+   exec="\"rocprofv3 --input=counters.txt -- $exec\""   
 elif [ "$NSYS" = 1 ]
 then
    rm -f $NOM.qdrep
@@ -903,7 +915,7 @@ then
       then
          sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'
          # metriques GPU en plus mais necessite d'etre sudo
-         trace=$trace" --gpu-metrics-device all"
+         trace=$trace" --gpu-metrics-devices all"
          # profiler="sudo " # Mais attention fichiers ecrits root !
       fi
    fi   
@@ -1201,17 +1213,6 @@ then
       then
          [ "`ldd $binary 2>/dev/null | grep gtl_hsa`" = "" ] && echo "Error, binary $exec not linked correctly for MPICH_GPU_SUPPORT_ENABLED=1 !" && exit -1
          echo "export MPICH_GPU_SUPPORT_ENABLED=1" >> $GPU_DIRECT
-         # More variables for adastra to fiabilize GPU communications tips from adastra support:
-         # Move into HOST files ?
-         if [ $HOST = adastra ] || [ $HOST = lumi ]
-         then
-	   echo "# Some flags to enable if issues MI250/MI300:
-export MPICH_ASYNC_PROGRESS=1
-export FI_CXI_RX_MATCH_MODE=software
-export FI_CXI_REQ_BUF_SIZE=12582912
-export FI_CXI_REQ_BUF_MIN_POSTED=8
-export FI_CXI_DEFAULT_CQ_SIZE=393216" >> $GPU_DIRECT
-	 fi
       fi
       PETSC_OPTIONS=$PETSC_OPTIONS" -use_gpu_aware_mpi 1"
    elif [ "$TRUST_USE_GPU" = 1 ]
@@ -1236,6 +1237,7 @@ export FI_CXI_DEFAULT_CQ_SIZE=393216" >> $GPU_DIRECT
       # et blocage possible sur castor avec Baltik par exemple !!!
       OUTPUT=.$NOM
    fi
+   rm -f $OUTPUT.err $OUTPUT.out
    if [ $NB_PROCS = 1 ] && [ $USE_MPIRUN = 0 ]
    then
       echo "$profiler \$exec \$case $PETSC_OPTIONS 1>$OUTPUT.out 2>$OUTPUT.err" >> $sub_file
@@ -1326,10 +1328,13 @@ export FI_CXI_DEFAULT_CQ_SIZE=393216" >> $GPU_DIRECT
    # A cause de platine (permission denied), on utilise plutot les chemins pointes /proc/self/fd/1 et 2
    [ -f $OUTPUT.out ] && cat $OUTPUT.out >> "/proc/self/fd/1" #cat $OUTPUT.out > "/dev/stdout"
    [ -f $OUTPUT.err ] && cat $OUTPUT.err >> "/proc/self/fd/2" #cat $OUTPUT.err > "/dev/stderr"
-   # Check for TRUST calculation only (cause coupled MC2 calculation DO NOT produce this message for example):
-   if [ "`grep 'Executable: ' $OUTPUT.err 2>/dev/null`" != "" ] && [ "`grep 'Arret des processes.' $OUTPUT.err`" = "" ]
+   if [ ! -f $OUTPUT.err ]
    then
+      echo "Error of submission !"
       err=1
+   else
+      # Check for TRUST calculation only (cause coupled MC2 calculation DO NOT produce this message for example):
+      [ "`grep 'Executable: ' $OUTPUT.err 2>/dev/null`" != "" ] && [ "`grep 'Arret des processes.' $OUTPUT.err`" = "" ] && err=1
    fi
    # Try to detect crashes (if it is not a TRUST binary, example PETSc test case)
    if [ "`grep 'invalid device function' $OUTPUT.err`" != "" ] || [ "`grep 'Signal: Aborted' $OUTPUT.err`" != "" ]
@@ -1337,7 +1342,7 @@ export FI_CXI_DEFAULT_CQ_SIZE=393216" >> $GPU_DIRECT
       err=1
    fi
    [ $sub = CCC ] && ccc_myproject -P $queue 2>/dev/null 		# Sur CCRT heures de calcul
-   [ "$TRUST_USE_SACCT" = 1 ] && trust -energy $id | tee -a $NOM.TU	# Energy consumption printed and added to .TU
+   [ "$err" != 1 ] && [ "$TRUST_USE_SACCT" = 1 ] && trust -energy $id | tee -a $NOM.TU	# Energy consumption printed and added to .TU
 else
    # Pas de gestionnaire, example PC:
    [ "$cpus_per_task" != "" ]         && [ "$HOST" != "jean-zay" ] && echo "Number of core per task option not supported yet on $HOST. Contact TRUST support" && exit -1
@@ -1432,10 +1437,10 @@ then
 elif [ "$ROCPROF" = 1 ]
 then
    echo "Use -g flag for source stack"
-   echo "Open $NOM"_results".fptrace file with: firefox https://ui.perfetto.dev"
+   echo "Open $NOM"_results".pftrace file with: firefox https://ui.perfetto.dev"
 elif [ "$NSYS" = 1 ]
 then
-   [ -f $NOM.nsys-rep ] && nsys-ui $NOM.nsys-rep
+   [ -f $NOM.nsys-rep ] && nsys-ui $NOM.nsys-rep && rm -f $NOM.nsys-rep $NOM.*.html $NOM.sqlite
 elif [ "$NCU" = 1 ]
 then
    # Marche pas encore:
diff --git a/env_src/HOST.env b/env_src/HOST.env
index bc979b8d90..7a1baa330f 100755
--- a/env_src/HOST.env
+++ b/env_src/HOST.env
@@ -21,10 +21,14 @@ elif [ "${HOST#gutta}"        != $HOST ];then HOST=aar
 elif [ "${HOST#grenx}"        != $HOST ];then HOST=aar
 elif [ "${HOST#aar}"          != $HOST ];then HOST=aar
 elif [ "${HOST#summer}"       != $HOST ];then HOST=summer
-elif [ "${HOST#calypso}"   != $HOST ]
+elif [ "${HOST#calypso}"      != $HOST ] # GH100:
 then
-    [ "${HOST#calypso-grace}"   = $HOST ] && echo "Error, you need to log on grace node with: salloc -p grace --gres=gpu:0" && exit
+    [ "${HOST#calypso-grace}"   = $HOST ] && echo "Error, you need to log and build on grace node (ARM) with: salloc -p grace --gres=gpu:0"
     HOST=calypso-grace
+elif [ "${HOST#dalia}"        != $HOST ] # GB200: Intel sur la frontale, incompatible avec ARM sur le noeud de calcul:
+then
+    [ "${HOST#dalianvl}"        = $HOST ] && echo "Error, you need to log and build (configure && make) on grace node (ARM) with: srun -p defq -t 240 --exclusive -c 144 --gres=gpu:0 --pty bash"
+    HOST=dalianvl
 elif [ "${HOST#mezel}"        != $HOST ]
 then
    HOST=mezel
diff --git a/env_src/HOST_adastra.sh b/env_src/HOST_adastra.sh
index 9c90cfb203..f13a351db3 100755
--- a/env_src/HOST_adastra.sh
+++ b/env_src/HOST_adastra.sh
@@ -9,7 +9,8 @@ define_modules_config()
    echo "Command qstat created on $HOST"
    cp $TRUST_ROOT/bin/KSH/qstat_wrapper $TRUST_ROOT/bin/KSH/qstat
    # Initialisation de l environnement module $MODULE_PATH
-   echo "source /etc/profile" >> $env
+   # echo "source /etc/profile" >> $env # Slow....
+   echo "source /etc/bashrc" >> $env
    #
    # Load modules
    if [ "$TRUST_USE_ROCM" = 1 ]
@@ -23,6 +24,7 @@ define_modules_config()
          echo "$ROCM_ARCH not supported on adastra!"
       fi
       # Compilateur hipcc
+      #module="PrgEnv-gnu/8.6.0 craype-accel-amd-$ROCM_ARCH rocm/6.4.3 gcc/11.2" # KO: kept for reference only, the working module list is set on the next line
       module="PrgEnv-gnu/8.6.0 craype-accel-amd-$ROCM_ARCH rocm/6.4.3"
       module=$module" firefox" # For profiling
    else
@@ -36,7 +38,7 @@ define_modules_config()
       # PL: C++20
       module="craype-x86-trento craype-network-ofi PrgEnv-gnu/8.5.0 libfabric" # gcc 13.X
    fi
-   module=$module" python/3.12.1 swig" # Pour -without-conda
+   module=$module" python/3.12.1 cmake/3.27.9 swig" # Pour -without-conda
    #
    echo "# Module $module detected and loaded on $HOST."
    echo "module purge 1>/dev/null" >> $env
@@ -67,6 +69,7 @@ define_soumission_batch()
    then
       project="genden15"
    fi   
+   rm -f ld_env.sh
    if [ "$gpu" = 1 ]
    then
       if [ "$ROCM_ARCH" = gfx90a ] # Partition MI250X (BW: 1600 GB/s)
@@ -83,7 +86,7 @@ define_soumission_batch()
           gpu_per_node=4
 	  # Not available on the GPU nodes:
           #cp -f /lib64/libsuitesparseconfig.so.4 .
-          #echo "export LD_LIBRARY_PATH=.:\$LD_LIBRARY_PATH" > ld_env.sh
+          #echo "export LD_LIBRARY_PATH=.:\$LD_LIBRARY_PATH" >> ld_env.sh
 	  #echo "export TRUST_DISABLE_CHECK_OS=1" >> ld_env.sh
       fi
       noeuds=`echo "1+($NB_PROCS-1)/$gpu_per_node" | bc`
@@ -100,6 +103,13 @@ define_soumission_batch()
       srun_options=""
       #[ $NB_PROCS -gt ??? ] && qos=???
    fi
+   # More variables for adastra to make GPU communications more reliable (tips from adastra support):
+   echo "# Some flags to enable if issues MI250/MI300:
+export MPICH_ASYNC_PROGRESS=1
+export FI_CXI_RX_MATCH_MODE=software
+export FI_CXI_REQ_BUF_SIZE=12582912
+export FI_CXI_REQ_BUF_MIN_POSTED=8
+export FI_CXI_DEFAULT_CQ_SIZE=393216" >> ld_env.sh
    node=1 # --exclusive
    ram=0 # Important pour acceder a toute la RAM du noeud
    # ToDo utiliser le binding !!!
@@ -107,11 +117,14 @@ define_soumission_batch()
    # https://dci.dci-gitlab.cines.fr/webextranet/porting_optimization/detailed_binding_script.html#adastra-detailed-binding-script
    # Attention, le verbose est important sinon crash ! voir doc
    USE_MPIRUN=1 # Pour profiter du binding meme en sequentiel
-   if [ "$TRUST_USE_OLD_BINDING" = 1 ] || [ "$ROCM_ARCH" = gfx942 ] # Pas clair encore le binding sur MI300
+   if [ "$ROCM_ARCH" = gfx942 ] # Pas clair encore le binding sur MI300
    then
       mpirun="srun -l $srun_options --mpi=cray_shasta --mem-bind=local --cpu-bind=verbose,cores"
-   else
+   elif [ "$ROCM_ARCH" = gfx90a ] # MI250
+   then
       mpirun="srun -l $srun_options --mem-bind=none --cpu-bind=verbose,none -- \$TRUST_ROOT/env_src/adastra_acc_binding.sh"
+   else
+      mpirun="srun -l $srun_options --mem-bind=none --cpu-bind=verbose,none -- "
    fi
    sub=SLURM
 }
diff --git a/env_src/HOST_calypso.sh b/env_src/HOST_calypso.sh
deleted file mode 100755
index 856b92f4fc..0000000000
--- a/env_src/HOST_calypso.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-##################################
-# Variables for configure script #
-##################################
-define_modules_config()
-{
-   env=$TRUST_ROOT/env/machine.env
-   # Initialisation de l environnement module $MODULE_PATH
-   #echo "source /etc/profile.d/modules.sh " >> $env
-   # Load modules
-   module="python/3.12.10 tools/cmake/3.28.2_arm nvhpc/24.1"  # openmpi/4.1.7_gcc114_cuda124 gcc/12.3.0_arm
-   [ "$TRUST_CUDA_CC" = "" ] && TRUST_CUDA_CC=90 # H100
-   echo "# Module $module detected and loaded on $HOST." 
-   echo "module purge 1>/dev/null" >> $env
-   echo "module load $module" >> $env
-   echo "[ \$? != 0 ] && echo \"Error: $module not found; we exit...\" && echo \"Contat TRUST support team or system administrator\" && exit -1" >> $env
-   echo $source >> $env
-   . $env
-   # Creation wrapper qstat -> squeue
-   echo "#!/bin/bash
-squeue" > $TRUST_ROOT/bin/qstat
-   chmod +x $TRUST_ROOT/bin/qstat
-}
-
-##############################
-# Variables for trust script #
-##############################
-define_soumission_batch()
-{
-   soumission=2
-   [ "$prod" = 1 ] && soumission=1
-   [ "$gpu"  = 1 ] && soumission=1
-   queue=grace && gpus_per_node=`echo $NB_PROCS | awk '{print $1<1?$1:1}'` && noeuds=`echo "1+($NB_PROCS-1)/1" | bc` # 1GPU/node
-   if [ "$prod" = 1 ] || [ "$NB_PROCS" -gt 40 ]
-   then
-      qos=2jours && cpu=2880
-      [ "$gpu" != 1 ] && node=1 # exclusif uniquement sur cpu
-   else
-      qos=test	&& cpu=60   && node=0 
-   fi
-   # Le binding ameliore fortement les performances sur AMD quelque soit MPI:
-   if [ "$I_MPI_ROOT" != "" ] # IntelMPI
-   then
-      binding="-m block:block --cpu-bind=rank"
-      mpirun="srun $binding -n \$SLURM_NTASKS"
-   elif  [ "$HPCX_DIR" != "" ] # HPC-X
-   then 
-      binding="--map-by numa --bind-to core"
-      mpirun="mpirun $binding -n \$SLURM_NTASKS"
-   else
-      mpirun="srun --mpi=pmix -n \$SLURM_NTASKS"
-   fi
-   sub=SLURM
-}
diff --git a/env_src/HOST_dalianvl.sh b/env_src/HOST_dalianvl.sh
new file mode 100755
index 0000000000..23c2b489bc
--- /dev/null
+++ b/env_src/HOST_dalianvl.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+#######################################################################
+# Dalia                                                      #
+#######################################################################
+
+
+##################################
+# Variables for configure script #
+##################################
+# Append the Dalia module setup to $TRUST_ROOT/env/machine.env, source it,
+# and create a qstat wrapper (calling squeue) in $TRUST_ROOT/bin.
+define_modules_config()
+{
+   env=$TRUST_ROOT/env/machine.env
+   # Initialize the module environment ($MODULE_PATH) if the module command is not available:
+   module -v 1>/dev/null 2>&1 || echo "source /etc/profile" >> $env
+   #
+   # Load modules (do not take gcc/14.2.0)
+   module="gmp/6.3.0 mpfr/4.2.1 mpc/1.3.1 slurm/slurm/24.11 nvhpc/25.9"
+   echo "# Module $module detected and loaded on $HOST."
+   echo "module purge 1>/dev/null" >> $env
+   echo "module load $module 1>/dev/null || exit -1" >> $env
+   echo "export TRUST_USE_SACCT=1" >> $env # Energy data 
+   echo "export USE_NVHPC_MPI=1" >> $env
+   #echo "export TRUST_BATCH=\"srun -p defq -n 144 -A pdl17744\"" >> $env
+   . $env
+   # Creation wrapper qstat -> squeue
+   echo "#!/bin/bash
+squeue" > $TRUST_ROOT/bin/qstat
+   chmod +x $TRUST_ROOT/bin/qstat
+}
+
+##############################
+# Variables for trust script #
+##############################
+# Set the batch submission variables (project, queue, node/GPU/CPU counts,
+# MPI launcher) used by the trust script for a SLURM submission.
+define_soumission_batch()
+{
+   soumission=1
+   # http://www.idris.fr/docs/dalia/dalia-environnement
+   project="pdl17744"
+   ntasks=144 # number of cores max
+   gpus_per_node=`echo $NB_PROCS | awk '{print $1<4?$1:4}'` # min(NB_PROCS, 4)
+   noeuds=`echo "1+($NB_PROCS-1)/4" | bc` # ceil(NB_PROCS/4)
+   cpus_per_task=`echo $ntasks/$gpus_per_node | bc`
+   queue=defq
+   cpu=60 # 2880 # 2 days
+   #qos=qos_gpu$q-t3 && cpu=1200 && [ "$prod" != 1 ] && [ $NB_PROCS -le 32 ] && qos=qos_gpu$q-dev && cpu=120 
+   #hintnomultithread=1
+   #node=1 # --exclusive
+   USE_MPIRUN=1
+   mpirun="mpiexec -n \$SLURM_NTASKS --mca pml ucx" # Command for NVHPC MPI UCX
+   sub=SLURM
+}
+
diff --git a/env_src/HOST_lumi.sh b/env_src/HOST_lumi.sh
index 24025d2447..0e590ea525 100755
--- a/env_src/HOST_lumi.sh
+++ b/env_src/HOST_lumi.sh
@@ -21,7 +21,8 @@ define_modules_config()
       else
          echo "$ROCM_ARCH not supported on lumi!"
       fi
-      module="PrgEnv-gnu/8.5.0 craype-accel-amd-$ROCM_ARCH LUMI/24.03 partition/G buildtools/24.03 CrayEnv rocm/6.4.4 gnuplot/5.4.10-cpeGNU-24.03"
+      LUMI=25.03 # Default 2026_05_15
+      module="PrgEnv-gnu/8.5.0 craype-accel-amd-$ROCM_ARCH LUMI/$LUMI partition/G buildtools/$LUMI CrayEnv rocm/6.4.4"
    else
       echo "Not configured." && exit -1 
    fi
@@ -52,7 +53,7 @@ define_soumission_batch()
       [ $NB_PROCS -le 128 ] && queue=dev-g      && qos="" && cpu=60 # 1h
       [ $NB_PROCS -le 64 ]  && queue=dev-g      && qos="" && cpu=120 # h2
    fi
-   project=project_465002428
+   project=project_465002986
    if [ "$gpu" = 1 ]
    then
       if [ "$ROCM_ARCH" = gfx90a ] # Partition MI250X (BW: 1600 GB/s)
@@ -68,6 +69,13 @@ define_soumission_batch()
    else
       echo "ToDo"
    fi
+   # More variables to make GPU communications more reliable (tips from adastra support):
+   echo "# Some flags to enable if issues MI250/MI300:
+export MPICH_ASYNC_PROGRESS=1
+export FI_CXI_RX_MATCH_MODE=software
+export FI_CXI_REQ_BUF_SIZE=12582912
+export FI_CXI_REQ_BUF_MIN_POSTED=8
+export FI_CXI_DEFAULT_CQ_SIZE=393216" >> ld_env.sh
    node=1 # --exclusive
    ram=480g # RAM per node (512 - 32)
    USE_MPIRUN=1
diff --git a/env_src/HOST_orcus-intel.sh b/env_src/HOST_orcus-intel.sh
index 9ecd8c39fd..e340386a8c 100755
--- a/env_src/HOST_orcus-intel.sh
+++ b/env_src/HOST_orcus-intel.sh
@@ -42,6 +42,7 @@ squeue" > $TRUST_ROOT/bin/qstat
 ##############################
 define_soumission_batch()
 {
+   [ "$TRUST_USE_CUDA" = 1 ] && echo "GPU run can't be launched on the Intel orcus frontale. Use orcus AMD frontale." && exit -1
    soumission=2
    [ "$prod" = 1 ] && soumission=1
    [ "$gpu"  = 1 ] && soumission=1
diff --git a/env_src/configurer_env b/env_src/configurer_env
index d161e54143..b9b243fa5f 100755
--- a/env_src/configurer_env
+++ b/env_src/configurer_env
@@ -685,9 +685,9 @@ case $arch in
                CUDA_COMPILER=nvcc
                TRUST_CC_BASE="nvcc_wrapper"
             fi
-            # Si n'est pas dans le PATH, on download NVHPC:
-            $CUDA_COMPILER --version 1>/dev/null 2>&1 || INSTALL_NVHPC=1
-            if [ "$INSTALL_NVHPC" = 1 ]
+            # If nvcc is not in the PATH or its release is not newer than 12.1, download NVHPC ($5+0 forces a numeric comparison in awk):
+            NVCC_OK=`nvcc --version 2>/dev/null | awk '/release/ {gsub(",","",$5); print ($5+0>12.1)}'`
+            if [ "$NVCC_OK" != 1 ]
             then
                source ../env_src/gpu/install_nvhpc_sdk_toolkit.sh
                echo "ADD_PATH $NVHPC/bin" >>$env
@@ -786,6 +786,11 @@ TRUST_NB_PROCS=`./configurer_env -TRUST_NB_PROCS`
 TRUST_MAKE="make -j $TRUST_NB_PROCS"
 if [ -f /proc/cpuinfo ]
 then
+   if [ "`uname -m`" = aarch64 ]
+   then
+      total_cores=`grep processor /proc/cpuinfo  | wc -l`
+      TRUST_NB_PROCS=$total_cores
+   else
    # Nombre de processeurs physiques:
    procs=`grep "physical id" /proc/cpuinfo | sort -u | wc -l` && [ "$procs" = 0 ] && procs=1
    cores_per_proc=`grep "core id" /proc/cpuinfo | sort -u | wc -l` && [ "$cores_per_proc" = 0 ] && cores_per_proc=1
@@ -793,6 +798,7 @@ then
    echo "# Detected $procs processors of $cores_per_proc cores means a total of $total_cores physical cores." | tee -a $env
    CACHE_SIZE=`awk '/cache size/ {if ($(NF-1)>cs) cs=$(NF-1)} END {print cs}' /proc/cpuinfo | sort -u`
    echo "# Detected a size cache of $CACHE_SIZE KB." | tee -a $env
+   fi
 elif [ `uname -s` = "Darwin" ]; then
    total_cores=$(sysctl -n hw.perflevel0.logicalcpu)
 fi
@@ -1291,13 +1297,14 @@ then
    then
       echo "NVidia driver version: $NVIDIA_VERSION"
       NVCC_VALIDE=`echo $NVCC_VERSION $NVIDIA_VERSION | awk '{if (1.0*$1<=1.0*$2) print 1}'`
-      if [ "$NVCC_VALIDE" = "" ]
-      then
-         echo "The NVidia drivers are too old for your Cuda compiler."
-         echo "You will experience possible error code:222, reason: the provided PTX was compiled with an unsupported toolchain"
-         echo "-> Update NVidia driver or take older Cuda compiler."
+      # Ok si meme Cuda 12.x ou Cuda 13.x
+      #if [ "$NVCC_VALIDE" = "" ]
+      #then
+      #   echo "The NVidia drivers are too old for your Cuda compiler."
+      #   echo "You will experience possible error code:222, reason: the provided PTX was compiled with an unsupported toolchain"
+      #   echo "-> Update NVidia driver or take older Cuda compiler."
          #exit -1
-      fi
+      #fi
    fi
    if [ "$TRUST_CUDA_CC" = "" ]
    then
@@ -1366,8 +1373,6 @@ then
       echo "ADD_PATH \$NVHPC_ROOT/$MPI/bin" >> $env
       echo "ADD_LD_LIBRARY_PATH \$NVHPC_ROOT/$MPI/lib" >> $env
    fi
-   # nvidia-ml for PETSc (energy measure):
-   [ -f /usr/lib64/libnvidia-ml.so ] && echo "ADD_LD_LIBRARY_PATH /usr/lib64" >> $env
 fi
 m="# TRUST will use CUDA ?";e="TRUST_USE_CUDA=\"$TRUST_USE_CUDA\" && export TRUST_USE_CUDA";ecrit $m"|"$e"|"$env
 
@@ -1946,7 +1951,8 @@ case $TRUST_ARCH_CC in
 	   [ "`echo $TRUST_VERSION_GNU | awk -F. '{print ($1==12)}'`" = 1 ] && CppFlags=$CppFlags" -Wno-use-after-free" # Uniquement si gcc12 (Erreur Kokkos sinon)
 	elif [ $TRUST_ARCH_CC = linux_nvcc_wrapper ]
 	then
-	   CppFlags=$CppFlags" -arch=sm_$TRUST_CUDA_CC --extended-lambda -Werror"
+	   # Suppress diagnostics 20011/20014: calling a __host__ function() from a __host__ __device__ function() is not allowed
+	   CppFlags=$CppFlags" -arch=sm_$TRUST_CUDA_CC --extended-lambda -Werror -Xcudafe --diag_suppress=20011 -Xcudafe --diag_suppress=20014"
 	fi
 	CppFlags=$CppFlags" -fno-common -Wno-long-long -Wall -Wno-unknown-pragmas -Wnon-virtual-dtor -Wreorder -Woverloaded-virtual -Wsynth -Wextra -Wno-unused-parameter -pedantic -fabi-version=0 -Wno-cpp"
 	CppFlags=$CppFlags" -fno-math-errno" # Operations mathematiques optimisees sans ecarts crees (vient de F5)
diff --git a/env_src/gpu/install_nvhpc_sdk_toolkit.sh b/env_src/gpu/install_nvhpc_sdk_toolkit.sh
index 86d6d83af7..a0a8296a16 100644
--- a/env_src/gpu/install_nvhpc_sdk_toolkit.sh
+++ b/env_src/gpu/install_nvhpc_sdk_toolkit.sh
@@ -1,12 +1,17 @@
 #!/bin/bash
-# NVIDIA HPC SDK
-NVIDIA_VERSION=`nvidia-smi 2>/dev/null | awk '/CUDA Version/ {v=$(NF-1);gsub("\\\.","",v);print v}'`
 # Cuda12.9 works on Driver 12.x. Issue for major version only. E.g: Cuda13.x on Cuda12.x
-# Keep 23.5 if issue on orcus or jean-zay...
-# SDK_VERSION=23.5  && CUDA_VERSION=12.1 && installer=nvhpc_2023_235_Linux_x86_64_cuda_$CUDA_VERSION  && installer_md5sum=eff38d63c4d08ca5c2962dad049a6833
-# Support Blackwell: 
-SDK_VERSION=25.5 && CUDA_VERSION=12.9 && installer=nvhpc_2025_255_Linux_x86_64_cuda_$CUDA_VERSION && installer_md5sum=748302adcb483bc332214a34dad1e31d
 
+TRUST_CUDA_VERSION=`nvidia-smi 2>/dev/null | awk '/CUDA Version/ {print $(NF-1)}'`
+if [ "$TRUST_CUDA_VERSION" = 13.0 ]
+then
+   # Debut du support Cuda 13 dans TRUST (ex: B6000)
+   # Restrict 25.11 to Cuda 13.0 for now: on A6000 with 13.2, strange crashes (SEGFAULT or division by 0), whereas 25.5 is OK
+   SDK_VERSION=25.11 && CUDA_VERSION=13.0 && installer=nvhpc_2025_2511_Linux_x86_64_cuda_$CUDA_VERSION && installer_md5sum=2601954ba94355aea67d32043aaa3263
+   # Domaine_VF.cpp plante sur ArborX avec:
+   # SDK_VERSION=26.3 && CUDA_VERSION=13.1 && installer=nvhpc_2026_263_Linux_x86_64_cuda_$CUDA_VERSION && installer_md5sum=712c670a876409f96608f7f29bdbcf51
+else
+   SDK_VERSION=25.5 && CUDA_VERSION=12.9 && installer=nvhpc_2025_255_Linux_x86_64_cuda_$CUDA_VERSION && installer_md5sum=748302adcb483bc332214a34dad1e31d
+fi
 INSTALL=$TRUST_ROOT/env/gpu/install
 NVHPC=$INSTALL/nvhpc-$SDK_VERSION/Linux_x86_64/$SDK_VERSION/compilers
 
diff --git a/src/DG/Operateurs/Op_Diff_Dift/Op_Diff_DG_Elem.cpp b/src/DG/Operateurs/Op_Diff_Dift/Op_Diff_DG_Elem.cpp
index a5a7a11c2d..3d8e21e858 100644
--- a/src/DG/Operateurs/Op_Diff_Dift/Op_Diff_DG_Elem.cpp
+++ b/src/DG/Operateurs/Op_Diff_Dift/Op_Diff_DG_Elem.cpp
@@ -107,7 +107,7 @@ void Op_Diff_DG_Elem::dimensionner(Matrice_Morse& la_matrice) const // TODO a re
   int size_inc = indices_glob_elem(nb_elem_tot);
 
   const Stencil& stencil_sorted = domaine.get_stencil_sorted();
-  const int nb_stencil_max = stencil_sorted.dimension(1);
+  const auto nb_stencil_max = stencil_sorted.dimension(1);
 
   la_matrice.dimensionner(size_inc, size_inc, 0);
 
@@ -137,7 +137,7 @@ void Op_Diff_DG_Elem::dimensionner(Matrice_Morse& la_matrice) const // TODO a re
   for (int nelem = 0; nelem < nb_elem_tot; nelem++)
     {
       auto row = tab1[indices_glob_elem(nelem)]-1 ;
-      auto nb_indices_line = tab1[indices_glob_elem(nelem)+1] - tab1[indices_glob_elem(nelem)];
+      int nb_indices_line = (int)(tab1[indices_glob_elem(nelem)+1] - tab1[indices_glob_elem(nelem)]);
       indice = 0;
       for (int d = 0; d < dim; d++)
         {
diff --git a/src/DG/Operateurs/Op_Divers/Op_Div_DG.cpp b/src/DG/Operateurs/Op_Divers/Op_Div_DG.cpp
index 012850a7b6..ac9006327b 100644
--- a/src/DG/Operateurs/Op_Divers/Op_Div_DG.cpp
+++ b/src/DG/Operateurs/Op_Divers/Op_Div_DG.cpp
@@ -101,7 +101,7 @@ void Op_Div_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl)
   int size_col = indices_glob_elem_v(nb_elem_tot);
 
   const Stencil& stencil_sorted = domaine.get_stencil_sorted();
-  const int nb_stencil_max = stencil_sorted.dimension(1);
+  const auto nb_stencil_max = stencil_sorted.dimension(1);
 
   int nb_indices_line;
   int row, col, indice;
@@ -132,8 +132,8 @@ void Op_Div_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl)
 
       for (int nelem = 0; nelem < nb_elem_tot; nelem++)
         {
-          row = tabv1[indices_glob_elem_p(nelem)] - 1;
-          nb_indices_line = tabv1[indices_glob_elem_p(nelem) + 1] - tabv1[indices_glob_elem_p(nelem)];
+          row = (int)(tabv1[indices_glob_elem_p(nelem)] - 1);
+          nb_indices_line = (int)(tabv1[indices_glob_elem_p(nelem) + 1] - tabv1[indices_glob_elem_p(nelem)]);
           indice = 0;
 
           for (int i = 0; i < nb_bfunc_p; i++)
@@ -187,11 +187,11 @@ void Op_Div_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl)
 
       for (int nelem = 0; nelem < nb_elem_tot; nelem++)
         {
-          nb_indices_line = tabp1(indices_glob_elem_p(nelem) + 1) - tabp1(indices_glob_elem_p(nelem));
+          nb_indices_line = (int)(tabp1(indices_glob_elem_p(nelem) + 1) - tabp1(indices_glob_elem_p(nelem)));
 
           for (int i = 0; i < nb_bfunc_p; i++)
             {
-              row = tabp1(indices_glob_elem_p(nelem) + i) - 1;
+              row = (int)(tabp1(indices_glob_elem_p(nelem) + i) - 1);
 
               for (int k = 0; k < nb_stencil_max; k++)
                 {
diff --git a/src/DG/Operateurs/Op_Divers/Op_Grad_DG.cpp b/src/DG/Operateurs/Op_Divers/Op_Grad_DG.cpp
index c881783504..b462730a97 100644
--- a/src/DG/Operateurs/Op_Divers/Op_Grad_DG.cpp
+++ b/src/DG/Operateurs/Op_Divers/Op_Grad_DG.cpp
@@ -93,7 +93,7 @@ void Op_Grad_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl
   coeff = 0;
 
   const Stencil& stencil_sorted = domaine.get_stencil_sorted();
-  const int nb_stencil_max = stencil_sorted.dimension(1);
+  const auto nb_stencil_max = stencil_sorted.dimension(1);
 
   int nb_indices_line;
   int row, col, indice;
@@ -116,8 +116,8 @@ void Op_Grad_DG::dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl
 
   for (int nelem = 0; nelem < nb_elem_tot; nelem++)
     {
-      row = tab1[indices_glob_elem_v(nelem)] - 1;
-      nb_indices_line = tab1[indices_glob_elem_v(nelem) + 1] - tab1[indices_glob_elem_v(nelem)];
+      row = (int)(tab1[indices_glob_elem_v(nelem)] - 1);
+      nb_indices_line = (int)(tab1[indices_glob_elem_v(nelem) + 1] - tab1[indices_glob_elem_v(nelem)]);
       indice = 0;
 
       for (int i = 0; i < nb_bfunc_v*dim; i++)
diff --git a/src/DG/Solveurs/Assembleur_P_DG.cpp b/src/DG/Solveurs/Assembleur_P_DG.cpp
index 2cfb3c10e8..e2da8a1d57 100644
--- a/src/DG/Solveurs/Assembleur_P_DG.cpp
+++ b/src/DG/Solveurs/Assembleur_P_DG.cpp
@@ -121,7 +121,7 @@ int Assembleur_P_DG::assembler_mat(Matrice& la_matrice, const DoubleVect& diag,
   int size_inc = indices_glob_elem(nb_elem_tot);
 
   const Stencil& stencil_sorted = domaine.get_stencil_sorted();
-  const int nb_stencil_max = stencil_sorted.dimension(1);
+  const auto nb_stencil_max = stencil_sorted.dimension(1);
 
   mat.dimensionner(size_inc, size_inc, 0);
   auto& tab1 = mat.get_set_tab1();
@@ -148,8 +148,8 @@ int Assembleur_P_DG::assembler_mat(Matrice& la_matrice, const DoubleVect& diag,
 
   for (int nelem = 0; nelem < nb_elem_tot; nelem++)
     {
-      row = tab1[indices_glob_elem(nelem)] - 1;
-      nb_indices_line = tab1[indices_glob_elem(nelem) + 1] - tab1[indices_glob_elem(nelem)];
+      row = (int)(tab1[indices_glob_elem(nelem)] - 1);
+      nb_indices_line = (int)(tab1[indices_glob_elem(nelem) + 1] - tab1[indices_glob_elem(nelem)]);
       indice = 0;
       for (int k = 0; k < nb_stencil_max; k++)
         {
diff --git a/src/EF/Champs/Champ_Fonc_Tabule_P0_EF.cpp b/src/EF/Champs/Champ_Fonc_Tabule_P0_EF.cpp
index 578ad7f0bd..6c3741e1cb 100644
--- a/src/EF/Champs/Champ_Fonc_Tabule_P0_EF.cpp
+++ b/src/EF/Champs/Champ_Fonc_Tabule_P0_EF.cpp
@@ -31,6 +31,7 @@ void Champ_Fonc_Tabule_P0_EF::associer_param(const VECT(OBS_PTR(Champ_base)) &le
 
 void Champ_Fonc_Tabule_P0_EF::mettre_a_jour(double t)
 {
+  // ToDo: replace by Champ_Fonc_P0_base::mettre_a_jour(t, la_table.valeur(), les_ch_param);
   DoubleTab& mes_valeurs = valeurs();
   int nb_elem = le_dom_VF->nb_elem(), nb_elem_tot = le_dom_VF->nb_elem_tot(), nb_param = les_ch_param.size();
   DoubleTabs val_params_aux_elems;
diff --git a/src/Kernel/Champs/Champ_Gen_de_Champs_Gen.cpp b/src/Kernel/Champs/Champ_Gen_de_Champs_Gen.cpp
index 965c00a014..7dd2e5e6a3 100644
--- a/src/Kernel/Champs/Champ_Gen_de_Champs_Gen.cpp
+++ b/src/Kernel/Champs/Champ_Gen_de_Champs_Gen.cpp
@@ -127,10 +127,6 @@ OWN_PTR(Champ_Fonc_base)& Champ_Gen_de_Champs_Gen::creer_espace_stockage(const N
                                                                          const int nb_comp,
                                                                          OWN_PTR(Champ_Fonc_base)& es_tmp) const
 {
-  if (es_tmp)
-    {
-      ToDo_Kokkos("critical, call to creer_espace_stockage() is expensive on GPU (fields copy on host). Refactor like Champ_Generique_Moyenne and other advanced fields...");
-    }
   Noms noms;
   Noms unites;
   for (int c=0; c<nb_comp; c++)
@@ -138,6 +134,10 @@ OWN_PTR(Champ_Fonc_base)& Champ_Gen_de_Champs_Gen::creer_espace_stockage(const N
   noms.add("bidon");
   double temps;
   temps = get_time();
+  if (temps>0) // Once the computation is initialized
+    {
+      ToDo_Kokkos("critical, recurent calls to creer_espace_stockage() is expensive on GPU (fields copy on host). Refactor like Champ_Generique_Moyenne and other advanced fields...");
+    }
   const Discretisation_base&  discr = get_discretisation();
   Motcle directive = get_directive_pour_discr();
   const Domaine_dis_base& domaine_dis = get_ref_domaine_dis_base();
diff --git a/src/Kernel/Champs/Champ_Generique_Correlation.cpp b/src/Kernel/Champs/Champ_Generique_Correlation.cpp
index 1b21f25078..c157fa3862 100644
--- a/src/Kernel/Champs/Champ_Generique_Correlation.cpp
+++ b/src/Kernel/Champs/Champ_Generique_Correlation.cpp
@@ -120,7 +120,7 @@ const Champ_base& Champ_Generique_Correlation::get_champ(OWN_PTR(Champ_base)& es
   else
     espace_stockage_->changer_temps(temps());
   DoubleTab& tab_correlation = espace_stockage_->valeurs();
-  tab_correlation = Op_Correlation_.calculer_valeurs();
+  Op_Correlation_.calculer(tab_correlation);
   tab_correlation.echange_espace_virtuel();
   return espace_stockage_;
 }
diff --git a/src/Kernel/Champs/Champ_Generique_Correlation_Triple.cpp b/src/Kernel/Champs/Champ_Generique_Correlation_Triple.cpp
new file mode 100644
index 0000000000..604fedae00
--- /dev/null
+++ b/src/Kernel/Champs/Champ_Generique_Correlation_Triple.cpp
@@ -0,0 +1,142 @@
+/****************************************************************************
+* Copyright (c) 2026, CEA
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*****************************************************************************/
+
+#include <Champ_Generique_Correlation_Triple.h>
+#include <Discretisation_base.h>
+#include <Schema_Temps_base.h>
+#include <Postraitement.h>
+#include <Synonyme_info.h>
+
+Implemente_instanciable(Champ_Generique_Correlation_Triple,
+                        "Champ_Post_Statistiques_Correlation_Triple|Correlation_Triple",
+                        Champ_Generique_Statistiques_base);
+// XD correlation_triple champ_post_statistiques_base correlation_triple -1 to calculate the triple correlation between three fields.
+
+Sortie& Champ_Generique_Correlation_Triple::printOn(Sortie& s) const
+{
+  return s << que_suis_je() << " " << le_nom(); // Writes que_suis_je() and le_nom() separated by one space
+}
+
+Entree& Champ_Generique_Correlation_Triple::readOn(Entree& s)
+{
+  return Champ_Generique_Statistiques_base::readOn(s); // Reading is fully delegated to the statistics base class
+}
+
+void Champ_Generique_Correlation_Triple::completer(const Postraitement_base& post)
+{
+  Champ_Gen_de_Champs_Gen::completer(post);
+  const Probleme_base& Pb = get_ref_pb_base();
+
+  const Champ_Generique_base& ch1 = get_source(0); // The three source fields handed to the triple-correlation operator
+  const Champ_Generique_base& ch2 = get_source(1);
+  const Champ_Generique_base& ch3 = get_source(2);
+  const Domaine_dis_base& zdis = get_ref_domaine_dis_base();
+
+  // Everything is handled internally: no need to look up fields in the post-processing object
+  Op_Correlation_Triple_.associer(zdis, ch1, ch2, ch3, tstat_deb_, tstat_fin_);
+
+  Nom prefix = Pb.le_nom() + "_"; // Prefix passed to the operator: <problem>_[<post>_][<parent>_]
+  if (post.le_nom() != "??" && post.le_nom() != "neant")
+    prefix += post.le_nom() + "_";
+  if (parent_name_ != "??" && !use_source_name_only_)
+    prefix += parent_name_ + "_";
+  Op_Correlation_Triple_.completer(Pb, prefix);
+}
+
+const Champ_base& Champ_Generique_Correlation_Triple::get_champ_without_evaluation(OWN_PTR(Champ_base)& espace_stockage) const
+{
+  // nb_comp_post() returns nb_comp_abc_ (without the auxiliary columns of the extended array)
+  const int nb_comp = Op_Correlation_Triple_.nb_comp_post();
+  Nature_du_champ nature_source = (nb_comp == 1) ? scalaire : vectoriel; // A single component is treated as a scalar field
+  OWN_PTR(Champ_Fonc_base) es_tmp;
+  espace_stockage = creer_espace_stockage(nature_source, nb_comp, es_tmp); // Storage only: fill_result() is not called here
+  return espace_stockage;
+}
+
+const Champ_base& Champ_Generique_Correlation_Triple::get_champ(OWN_PTR(Champ_base)& espace_stockage) const
+{
+  // nb_comp_post() returns nb_comp_abc_ (without the auxiliary columns of the extended array)
+  const int nb_comp = Op_Correlation_Triple_.nb_comp_post();
+  Nature_du_champ nature_source = (nb_comp == 1) ? scalaire : vectoriel;
+  if (!espace_stockage_) // The member storage is created on the first call, then only re-timed
+    creer_espace_stockage(nature_source, nb_comp, espace_stockage_);
+  else
+    espace_stockage_->changer_temps(temps());
+  DoubleTab& tab = espace_stockage_->valeurs();
+  // fill_result writes straight into tab (which already carries the right P0 md_vector)
+  // without creating a temporary array, which avoids the md_vector_ == v.md_vector_
+  // assertion that would fire if we did tab = calculer_valeurs() with a result
+  // that has no md_vector.
+  Op_Correlation_Triple_.fill_result(tab);
+  tab.echange_espace_virtuel();
+  return espace_stockage_; // NOTE(review): the espace_stockage argument is not used; the member storage is returned
+}
+
+const Noms Champ_Generique_Correlation_Triple::get_property(const Motcle& query) const
+{
+  Motcles motcles(2);
+  motcles[0] = "unites";
+  motcles[1] = "composantes";
+  switch (motcles.search(query))
+    {
+    case 0:
+      {
+        // Return only the first nb_comp_abc_ units (not those of the auxiliary columns)
+        const int nb = Op_Correlation_Triple_.nb_comp_post();
+        const Noms& all = integrale().le_champ_calcule().unites();
+        if (all.size() <= nb) return all;
+        Noms res(nb);
+        for (int i = 0; i < nb; i++) res[i] = all[i];
+        return res;
+      }
+    case 1:
+      {
+        // Return only the first nb_comp_abc_ component names
+        const int nb = Op_Correlation_Triple_.nb_comp_post();
+        const Noms& all = integrale().le_champ_calcule().noms_compo();
+        if (all.size() <= nb) return all;
+        Noms res(nb);
+        for (int i = 0; i < nb; i++) res[i] = all[i];
+        return res;
+      }
+    }
+  return Champ_Gen_de_Champs_Gen::get_property(query); // Any other query is answered by Champ_Gen_de_Champs_Gen
+}
+
+void Champ_Generique_Correlation_Triple::nommer_source()
+{
+  if (nom_post_ == "??") // presumably "??" means no name was supplied by the user - TODO confirm
+    {
+      Nom n("Correlation_Triple_"); // Generated name: Correlation_Triple_<source0>_<source1>_<source2>
+      n += get_source(0).get_property("nom")[0];
+      n += "_";
+      n += get_source(1).get_property("nom")[0];
+      n += "_";
+      n += get_source(2).get_property("nom")[0];
+      nommer(n);
+    }
+}
+
+int Champ_Generique_Correlation_Triple::get_info_type_post() const
+{
+  return (get_property("composantes").size() > 1) ? 1 : 0; // 1 when more than one component name is reported, else 0
+}
+
+const Motcle Champ_Generique_Correlation_Triple::get_directive_pour_discr() const
+{
+  if (Op_Correlation_Triple_.integrale().get_support_different()) // Support flagged as different: use "champ_elem"
+    return Motcle("champ_elem");
+  return Op_Correlation_Triple_.le_champ_a()->get_directive_pour_discr(); // Otherwise reuse the directive of field A
+}
diff --git a/src/Kernel/Champs/Champ_Generique_Correlation_Triple.h b/src/Kernel/Champs/Champ_Generique_Correlation_Triple.h
new file mode 100644
index 0000000000..4c830bb444
--- /dev/null
+++ b/src/Kernel/Champs/Champ_Generique_Correlation_Triple.h
@@ -0,0 +1,77 @@
+/****************************************************************************
+* Copyright (c) 2026, CEA
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*****************************************************************************/
+
+#ifndef Champ_Generique_Correlation_Triple_included
+#define Champ_Generique_Correlation_Triple_included
+
+#include <Champ_Generique_Statistiques_base.h>
+#include <TRUSTTabs_forward.h>
+#include <Op_Correlation_Triple.h>
+
+class Postraitement_base;
+
+/*! @brief class Champ_Generique_Correlation_Triple
+ *
+ *  Field intended for post-processing a triple correlation <F'G'H'>.
+ *  The three corresponding Moyenne fields must be declared before this field in the data file.
+ *
+ *  Syntax:
+ *    nom  correlation_triple { t_deb val t_fin val sources_reference { F , G , H } }
+ *
+ */
+class Champ_Generique_Correlation_Triple : public Champ_Generique_Statistiques_base
+{
+  Declare_instanciable(Champ_Generique_Correlation_Triple);
+
+public:
+
+  const Noms get_property(const Motcle& query) const override;
+
+  inline double temps() const override
+  {
+    return Op_Correlation_Triple_.integrale().le_champ_calcule().temps(); // Time carried by the operator's computed field
+  };
+  inline const Integrale_tps_Champ& integrale() const override
+  {
+    return Op_Correlation_Triple_.integrale(); // Forwarded to the triple-correlation operator
+  };
+
+  inline const Operateur_Statistique_tps_base& Operateur_Statistique() const override;
+  inline Operateur_Statistique_tps_base& Operateur_Statistique() override;
+  void completer(const Postraitement_base& post) override;
+
+  const Motcle get_directive_pour_discr() const override;
+  const Champ_base& get_champ_without_evaluation(OWN_PTR(Champ_base)& espace_stockage) const override;
+  const Champ_base& get_champ(OWN_PTR(Champ_base)& espace_stockage) const override;
+  void nommer_source() override;
+  int get_info_type_post() const override;
+
+protected:
+  Op_Correlation_Triple Op_Correlation_Triple_; // Statistics operator owned by this field
+
+private:
+  mutable OWN_PTR(Champ_Fonc_base) espace_stockage_; // mutable: created and refreshed from the const get_champ()
+};
+
+inline const Operateur_Statistique_tps_base& Champ_Generique_Correlation_Triple::Operateur_Statistique() const
+{
+  return Op_Correlation_Triple_; // Read-only access to the operator through its base-class type
+}
+inline Operateur_Statistique_tps_base& Champ_Generique_Correlation_Triple::Operateur_Statistique()
+{
+  return Op_Correlation_Triple_; // Mutable access to the operator through its base-class type
+}
+
+#endif
diff --git a/src/Kernel/Champs/Champ_Generique_Ecart_Type.cpp b/src/Kernel/Champs/Champ_Generique_Ecart_Type.cpp
index e5c6805663..1646a003dc 100644
--- a/src/Kernel/Champs/Champ_Generique_Ecart_Type.cpp
+++ b/src/Kernel/Champs/Champ_Generique_Ecart_Type.cpp
@@ -107,7 +107,7 @@ const Champ_base& Champ_Generique_Ecart_Type::get_champ(OWN_PTR(Champ_base)&) co
   else
     espace_stockage_->changer_temps(temps());
   DoubleTab& tab_ecart_type = espace_stockage_->valeurs();
-  tab_ecart_type = Op_Ecart_Type_.calculer_valeurs();
+  Op_Ecart_Type_.calculer(tab_ecart_type);
   tab_ecart_type.echange_espace_virtuel();
   return espace_stockage_;
 }
diff --git a/src/Kernel/Champs/Champ_Generique_Moyenne.cpp b/src/Kernel/Champs/Champ_Generique_Moyenne.cpp
index 2cf4ada943..80af70a040 100644
--- a/src/Kernel/Champs/Champ_Generique_Moyenne.cpp
+++ b/src/Kernel/Champs/Champ_Generique_Moyenne.cpp
@@ -93,11 +93,15 @@ const Champ_base& Champ_Generique_Moyenne::get_champ(OWN_PTR(Champ_base)&) const
   Nature_du_champ nature_source = source.nature_du_champ();
   int nb_comp = source.nb_comp();
   if (!espace_stockage_)
-    creer_espace_stockage(nature_source,nb_comp,espace_stockage_);
+    {
+      creer_espace_stockage(nature_source, nb_comp, espace_stockage_);
+      //mapToDevice(espace_stockage_->valeurs()); // Force creation on device
+      // PL: normally A=B (line 103) if B is on device should create A on device no ?
+    }
   else
     espace_stockage_->changer_temps(temps());
   DoubleTab& tab_moy = espace_stockage_->valeurs();
-  tab_moy = Op_Moyenne_.calculer_valeurs();
+  Op_Moyenne_.calculer(tab_moy);
   tab_moy.echange_espace_virtuel();
   return espace_stockage_;
 }
diff --git a/src/Kernel/Champs/Champ_Generique_Predefini.cpp b/src/Kernel/Champs/Champ_Generique_Predefini.cpp
index 70a30d0c2d..4787ffda2f 100644
--- a/src/Kernel/Champs/Champ_Generique_Predefini.cpp
+++ b/src/Kernel/Champs/Champ_Generique_Predefini.cpp
@@ -19,7 +19,7 @@
 
 Implemente_instanciable(Champ_Generique_Predefini,"Predefini",Champ_Gen_de_Champs_Gen);
 // XD predefini champ_generique_base predefini -1 This keyword is used to post process predefined postprocessing fields.
-// XD attr pb_champ deuxmots pb_champ 0 { Pb_champ nom_pb nom_champ } : nom_pb is the problem name and nom_champ is the selected field name. The available keywords for the field name are: energie_cinetique_totale, energie_cinetique_elem, viscosite_turbulente, viscous_force_x, viscous_force_y, viscous_force_z, pressure_force_x, pressure_force_y, pressure_force_z, total_force_x, total_force_y, total_force_z, viscous_force, pressure_force, total_force
+// XD attr pb_champ deuxmots pb_champ 0 { Pb_champ nom_pb nom_champ } : nom_pb is the problem name and nom_champ is the selected field name. The available keywords for the field name are: energie_cinetique_totale, energie_cinetique_elem, enstrophie_totale, viscosite_turbulente, viscous_force_x, viscous_force_y, viscous_force_z, pressure_force_x, pressure_force_y, pressure_force_z, total_force_x, total_force_y, total_force_z, viscous_force, pressure_force, total_force
 
 Sortie& Champ_Generique_Predefini::printOn(Sortie& s ) const
 {
@@ -92,6 +92,8 @@ const Noms Champ_Generique_Predefini::get_property(const Motcle& query) const
           mots[0] = "kg.m2/s2";
         else if (Motcle(type_champ_)=="ENERGIE_CINETIQUE_ELEM")
           mots[0] = "kg/(m.s2)";
+        else if (Motcle(type_champ_)=="ENSTROPHIE_TOTALE")
+          mots[0] = "s-2";
         else if (Motcle(type_champ_)=="VISCOSITE_TURBULENTE")
           mots[0] = "m2/s";
         else if (Motcle(type_champ_)=="VISCOUS_FORCE_X")
@@ -141,7 +143,7 @@ void Champ_Generique_Predefini::nommer_source()
 
 Nom Champ_Generique_Predefini::construit_expression()
 {
-  Motcles les_mots(15);
+  Motcles les_mots(16);
   {
     les_mots[0] = "energie_cinetique_totale";
     les_mots[1] = "energie_cinetique_elem";
@@ -158,6 +160,7 @@ Nom Champ_Generique_Predefini::construit_expression()
     les_mots[12] = "viscous_force";
     les_mots[13] = "pressure_force";
     les_mots[14] = "total_force";
+    les_mots[15] = "enstrophie_totale";
   }
 
   Nom expression("");
@@ -363,6 +366,17 @@ Nom Champ_Generique_Predefini::construit_expression()
       }
 
 
+    case 15:
+      {
+        // enstrophie_totale = volume-integrated 0.5*|omega|^2, with omega = curl(u)
+        expression  = " Reduction_0D { methode somme_ponderee ";
+        expression += " source Transformation { methode formule expression 1 0.5*norme_omega*norme_omega ";
+        expression += " sources { Transformation { methode norme  localisation elem  source RefChamp { Pb_champ ";
+        expression += nom_pb_;
+        expression += " vorticite }  nom_source norme_omega } } } } ";
+        break;
+      }
+
     default :
       {
         Cerr<<"Only keywords among "<<les_mots<<" are allowed."<<finl;
diff --git a/src/Kernel/Champs/Champ_Generique_Reduction_0D.cpp b/src/Kernel/Champs/Champ_Generique_Reduction_0D.cpp
index 045f9437f2..921f00c726 100644
--- a/src/Kernel/Champs/Champ_Generique_Reduction_0D.cpp
+++ b/src/Kernel/Champs/Champ_Generique_Reduction_0D.cpp
@@ -276,10 +276,13 @@ const Champ_base& Champ_Generique_Reduction_0D::get_champ(OWN_PTR(Champ_base)&)
           if (nb_dim!=nb_comp) //Cas des Champ_Face_VDF
             {
               size_vect=0;
-              ToDo_Kokkos("critical, warning check you have a NR test case with .son !");
-              for (int i=0; i<valeurs_source.dimension(0); i++)
-                if (zvf.orientation(i)==comp)
-                  ++size_vect;
+              CIntArrView orientation = zvf.orientation().view_ro();
+              Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), valeurs_source.dimension(0),
+                                      KOKKOS_LAMBDA(const int i, int& count)
+              {
+                if (orientation(i) == comp) count++;
+              }, Kokkos::Sum<int>(size_vect));
+              end_gpu_timer(__KERNEL_NAME__);
             }
 
           DoubleTrav vect_source;
@@ -287,7 +290,8 @@ const Champ_base& Champ_Generique_Reduction_0D::get_champ(OWN_PTR(Champ_base)&)
           //pour appliquer val_extraite = mp_prodscal(vect_source,un)
           //Sa dimension est alors fixee par rapport au nombre d items de la source
           //ex : zvf.nb_faces() si loc==FACE
-          if (methode_=="somme" || methode_=="moyenne" || methode_=="sum" || methode_=="average")
+          const bool flag = methode_=="somme" || methode_=="moyenne" || methode_=="sum" || methode_=="average";
+          if (flag)
             {
               Entity loc = get_localisation();
               if (loc==Entity::ELEMENT)
@@ -316,19 +320,30 @@ const Champ_base& Champ_Generique_Reduction_0D::get_champ(OWN_PTR(Champ_base)&)
             }
           else
             {
-              ToDo_Kokkos("critical, warning check you have a NR test case with .son !");
-              int k=0;
-              for (int i=0; i<valeurs_source.dimension(0); i++)
-                if (zvf.orientation(i)==comp)
+              const int n_faces = valeurs_source.dimension(0);
+              if (flag)
+                {
+                  CIntArrView orientation = zvf.orientation().view_ro();
+                  CDoubleArrView valeurs = static_cast<const ArrOfDouble&>(valeurs_source).view_ro();
+                  DoubleArrView vect = static_cast<ArrOfDouble&>(vect_source).view_rw();
+                  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), n_faces,
+                                       KOKKOS_LAMBDA(const int i)
                   {
-                    if (methode_=="somme" || methode_=="moyenne" || methode_=="sum" || methode_=="average")
-                      vect_source(i) = valeurs_source(i);
-                    else
+                    if (orientation(i) == comp) vect(i) = valeurs(i);
+                  });
+                  end_gpu_timer(__KERNEL_NAME__);
+                }
+              else
+                {
+                  ToDo_Kokkos("critical not so easy, numerotation dependant");
+                  int k = 0;
+                  for (int i = 0; i < n_faces; i++)
+                    if (zvf.orientation(i) == comp)
                       {
                         vect_source(k) = valeurs_source(i);
                         k++;
                       }
-                  }
+                }
             }
           // Passage si necessaire de la composante pour les Champ_face
           extraire(val_extraite,vect_source,basis_function,(nb_dim==nb_comp?-1:comp));
@@ -344,10 +359,15 @@ const Champ_base& Champ_Generique_Reduction_0D::get_champ(OWN_PTR(Champ_base)&)
             }
           else
             {
-              ToDo_Kokkos("critical, warning check you have a NR test case with .son !");
-              for (int i=0; i<valeurs_source.dimension(0); i++)
-                if (zvf.orientation(i)==comp)
-                  espace_valeurs(i) = val_extraite;
+              int size = valeurs_source.dimension(0);
+              CIntArrView orientation = zvf.orientation().view_ro();
+              DoubleArrView valeurs = static_cast<ArrOfDouble&>(espace_valeurs).view_rw();
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), size, KOKKOS_LAMBDA(const int i)
+              {
+                if (orientation(i)==comp)
+                  valeurs(i) = val_extraite;
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
         }
     }
diff --git a/src/Kernel/Champs/Champ_Generique_Transformation.cpp b/src/Kernel/Champs/Champ_Generique_Transformation.cpp
index a69934301b..ba180272f4 100644
--- a/src/Kernel/Champs/Champ_Generique_Transformation.cpp
+++ b/src/Kernel/Champs/Champ_Generique_Transformation.cpp
@@ -538,12 +538,7 @@ const Champ_base& Champ_Generique_Transformation::get_champ(OWN_PTR(Champ_base)&
   //de stockage
   DoubleTrav positions;
   if (localisation_ == "elem")
-    {
-      if (zvf.xp().nb_dim() != 2) /* xp() non initialise */
-        zvf.domaine().calculer_centres_gravite(positions);
-      else
-        zvf.get_position(positions);
-    }
+    zvf.get_position(positions);
   else if (localisation_ == "som")
     positions = get_ref_domain().coord_sommets();
   else if (localisation_ == "faces")
@@ -579,7 +574,7 @@ const Champ_base& Champ_Generique_Transformation::get_champ(OWN_PTR(Champ_base)&
       Process::exit();
     }
   DoubleTravs sources_val(nb_sources);
-  IntTrav nb_comps(nb_sources);
+  ArrOfInt nb_comps(nb_sources);
   Noms nom_source(nb_sources);
   int dim_compo = 2*dimension;
   Noms compo(dim_compo);
@@ -888,6 +883,7 @@ const Champ_base& Champ_Generique_Transformation::get_champ(OWN_PTR(Champ_base)&
         {
           ToDo_Kokkos("Code but check test!");
           int dim = dimension;
+          int nb_comp = nb_comp_;
           int nb_elem = valeurs_espace.dimension(0);
           Kokkos::Array<CDoubleTabView, max_nb_sources> sources;
           for (int so=0; so<nb_sources; so++)
@@ -903,9 +899,9 @@ const Champ_base& Champ_Generique_Transformation::get_champ(OWN_PTR(Champ_base)&
           DoubleTabView valeurs = valeurs_espace.view_wo();
           Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int i)
           {
+            int threadId = parser.acquire();
             for (int pt=0; pt<nb_points_w(i); pt++)
               {
-                int threadId = parser.acquire();
                 int k = ind_integ_points_w(i)+pt;
                 double x = special ? 1e38 : pos(k,0);
                 double y = special ? 1e38 : pos(k,1);
@@ -914,8 +910,7 @@ const Champ_base& Champ_Generique_Transformation::get_champ(OWN_PTR(Champ_base)&
                 parser.setVar(1,y,threadId);
                 parser.setVar(2,z,threadId);
                 parser.setVar(3,temps,threadId);
-
-                for (int d = 0; d < nb_comp_; d++)
+                for (int d = 0; d < nb_comp; d++)
                   {
                     int j = nb_pts_integ_max*d + pt;
                     for (int so=0; so<nb_sources; so++)
@@ -923,6 +918,7 @@ const Champ_base& Champ_Generique_Transformation::get_champ(OWN_PTR(Champ_base)&
                     valeurs(i, j) = parser.eval(threadId);
                   }
               }
+            parser.release(threadId);
           });
           end_gpu_timer(__KERNEL_NAME__);
         }
@@ -1036,9 +1032,15 @@ const Motcle Champ_Generique_Transformation::get_directive_pour_discr() const
   Motcle directive;
   if (localisation_=="elem")
     {
-      OWN_PTR(Champ_base) source_espace_stockage;
-      const Champ_base& source = get_source(0).get_champ_without_evaluation(source_espace_stockage);
-      directive = (source.is_basis_function() || source.is_quadrature()) ? "champ_fonc_quad_dg" : "champ_elem";
+      const Domaine_dis_base& domaine_dis = get_ref_domaine_dis_base();
+      if (domaine_dis.que_suis_je() == "Domaine_DG")
+        {
+          OWN_PTR(Champ_base) source_espace_stockage;
+          const Champ_base& source = get_source(0).get_champ_without_evaluation(source_espace_stockage); // PL especially slow on GPU for the moment...
+          directive = (source.is_basis_function() || source.is_quadrature()) ? "champ_fonc_quad_dg" : "champ_elem";
+        }
+      else
+        directive = "champ_elem";
     }
   else if (localisation_=="som")
     {
diff --git a/src/Kernel/Champs/Champ_front_txyz.cpp b/src/Kernel/Champs/Champ_front_txyz.cpp
index de903026ef..417182827b 100644
--- a/src/Kernel/Champs/Champ_front_txyz.cpp
+++ b/src/Kernel/Champs/Champ_front_txyz.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -19,6 +19,8 @@
 #include <Frontiere_dis_base.h>
 #include <Domaine_VF.h>
 #include <TRUSTTrav.h>
+#include <ParserView.h>
+#include <Device.h>
 
 Implemente_instanciable(Champ_front_txyz,"Champ_front_fonc_txyz",Ch_front_var_instationnaire_indep);
 // XD champ_front_fonc_txyz front_field_base champ_front_fonc_txyz 0 Boundary field which is not constant in space and in time.
@@ -98,25 +100,33 @@ Champ_front_base& Champ_front_txyz::affecter_(const Champ_front_base& ch)
 
 void Champ_front_txyz::mettre_a_jour(double temps)
 {
-  int dim=nb_comp();
-  const Frontiere_dis_base& fr_dis=frontiere_dis();
+  int dim = nb_comp();
+  const Frontiere_dis_base& fr_dis = frontiere_dis();
   const Domaine_VF& zvf = ref_cast(Domaine_VF, fr_dis.domaine_dis());
-  int nb_faces=ref_cast(Front_VF, fr_dis).nb_faces();
+  int nb_faces = ref_cast(Front_VF, fr_dis).nb_faces();
   int premiere_face = ref_cast(Front_VF, fr_dis).num_premiere_face();
-  DoubleTab& tab=valeurs_au_temps(temps);
-  for(int i=0; i<nb_faces; i++)
+  DoubleTab& tab_val = valeurs_au_temps(temps);
+
+  int dim3 = (dimension >= 3); // 1 when a third coordinate exists, else 0; plain int used inside the kernel lambda
+  CDoubleTabView xv = zvf.xv().view_ro();
+  DoubleTabView tab = tab_val.view_rw();
+  for (int k = 0; k < dim; k++) // One kernel launch per component k, each looping over the boundary faces
     {
-      for(int k=0; k<dim; k++)
-        {
-          fxyz[k].setVar(0,temps);
-          fxyz[k].setVar(1,zvf.xv(premiere_face + i, 0));
-          fxyz[k].setVar(2,zvf.xv(premiere_face + i, 1));
-          if (dimension >= 3)
-            fxyz[k].setVar(3,zvf.xv(premiere_face + i, 2));
-          tab(i,k)=fxyz[k].eval();
-        }
+      ParserView fxyzk(fxyz[k]); // ParserView built from the k-th component's parser for use in the kernel
+      fxyzk.parseString();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces, KOKKOS_LAMBDA(const int i)
+      {
+        int threadId = fxyzk.acquire(); // Every acquire() is paired with the release() at the end of the lambda
+        fxyzk.setVar(0, temps, threadId);
+        fxyzk.setVar(1, xv(premiere_face + i, 0), threadId);
+        fxyzk.setVar(2, xv(premiere_face + i, 1), threadId);
+        fxyzk.setVar(3, dim3 ? xv(premiere_face + i, 2) : 0.0, threadId); // NOTE(review): var 3 is now always set (0.0 below 3D); the removed loop skipped it - confirm var 3 is always declared
+        tab(i, k) = fxyzk.eval(threadId);
+        fxyzk.release(threadId);
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
-  tab.echange_espace_virtuel();
+  tab_val.echange_espace_virtuel(); // Virtual-space exchange is done on the DoubleTab itself, not on its view
 }
 
 double Champ_front_txyz::valeur_au_temps_et_au_point(double temps,int som,double x,double y, double z, int k) const
diff --git a/src/Kernel/Champs/Champ_front_vide.h b/src/Kernel/Champs/Champ_front_vide.h
index d9f2007ba1..bbe71e4a5f 100644
--- a/src/Kernel/Champs/Champ_front_vide.h
+++ b/src/Kernel/Champs/Champ_front_vide.h
@@ -34,8 +34,8 @@ class Champ_front_vide : public Champ_front_base
 
 public:
   bool has_valeurs_au_temps(double temps) const override { return false; }
-  DoubleTab& valeurs_au_temps(double temps) override { Process::exit("Impossible d'appeler les valeurs d'un champ_fronc_vide"); return les_valeurs->valeurs();};
-  const DoubleTab& valeurs_au_temps(double temps) const override { Process::exit("Impossible d'appeler les valeurs d'un champ_fronc_vide"); return les_valeurs->valeurs();};
+  DoubleTab& valeurs_au_temps(double temps) override { Process::exit("Impossible d'appeler les valeurs d'un champ_front_vide"); return les_valeurs->valeurs();};
+  const DoubleTab& valeurs_au_temps(double temps) const override { Process::exit("Impossible d'appeler les valeurs d'un champ_front_vide"); return les_valeurs->valeurs();};
   int avancer(double temps) override {return 1;};
   int reculer(double temps) override {return 1;};
   Champ_front_base& affecter_(const Champ_front_base& ch) override {return *this;};
diff --git a/src/Kernel/Champs/Champs_Don/Champ_Don_lu.cpp b/src/Kernel/Champs/Champs_Don/Champ_Don_lu.cpp
index c30792f99a..5df5851921 100644
--- a/src/Kernel/Champs/Champs_Don/Champ_Don_lu.cpp
+++ b/src/Kernel/Champs/Champs_Don/Champ_Don_lu.cpp
@@ -104,7 +104,6 @@ Entree& Champ_Don_lu::readOn(Entree& is)
   // Lecture des valeurs dans le fichier fic
   DoubleTab& mes_val = valeurs();
   IntTab compteur(nb_elems);
-  compteur = 0;
   DoubleVect point(dimension);
   DoubleVect val_lu(dim);
   int elem2;
diff --git a/src/Kernel/Champs/Champs_Don/Champ_Uniforme_Morceaux.cpp b/src/Kernel/Champs/Champs_Don/Champ_Uniforme_Morceaux.cpp
index ec4a3d7613..f08ddc13cf 100644
--- a/src/Kernel/Champs/Champs_Don/Champ_Uniforme_Morceaux.cpp
+++ b/src/Kernel/Champs/Champs_Don/Champ_Uniforme_Morceaux.cpp
@@ -97,9 +97,9 @@ Entree& Champ_Uniforme_Morceaux::readOn(Entree& is)
       z = z/nsom;
       for( k=0; k< dim; k++)
         {
-          fxyz[k].setVar("x",x);
-          fxyz[k].setVar("y",y);
-          fxyz[k].setVar("z",z);
+          fxyz[k].setVar(0,x);
+          fxyz[k].setVar(1,y);
+          fxyz[k].setVar(2,z);
           valeurs_(poly,k)=fxyz[k].eval();
         }
     }
@@ -134,9 +134,9 @@ Entree& Champ_Uniforme_Morceaux::readOn(Entree& is)
           z = z/nsom;
           for( k=0; k< dim; k++)
             {
-              fxyz[k].setVar("x",x);
-              fxyz[k].setVar("y",y);
-              fxyz[k].setVar("z",z);
+              fxyz[k].setVar(0,x);
+              fxyz[k].setVar(1,y);
+              fxyz[k].setVar(2,z);
               valeurs_(ssz(poly),k)=fxyz[k].eval();
             }
         }
diff --git a/src/Kernel/Champs_dis/Champ_Fonc_P0_base.h b/src/Kernel/Champs_dis/Champ_Fonc_P0_base.h
index 62a6794cdd..4d760c130f 100644
--- a/src/Kernel/Champs_dis/Champ_Fonc_P0_base.h
+++ b/src/Kernel/Champs_dis/Champ_Fonc_P0_base.h
@@ -18,6 +18,11 @@
 
 #include <Champ_implementation_P0.h>
 #include <Champ_Fonc_base.h>
+#include <Table.h>
+#include <Domaine_VF.h>
+#include <TRUSTTrav.h>
+#include <ParserView.h>
+class Table;
 
 class Champ_Fonc_P0_base: public Champ_Fonc_base, public Champ_implementation_P0
 {
@@ -78,6 +83,12 @@ class Champ_Fonc_P0_base: public Champ_Fonc_base, public Champ_implementation_P0
     return Champ_implementation_P0::remplir_coord_noeuds_et_polys(positions, polys);
   }
 
+  inline void mettre_a_jour(double t, const Table& table, VECT(OBS_PTR(Champ_base))& les_ch_param)
+  {
+    Champ_implementation_P0::mettre_a_jour(t, table, les_ch_param); // Table-based update implemented in Champ_implementation_P0
+    Champ_Fonc_base::mettre_a_jour(t); // Then the regular Champ_Fonc_base update for time t
+  }
+
 protected:
   Champ_base& le_champ() override { return *this; }
   const Champ_base& le_champ() const override { return *this; }
diff --git a/src/Kernel/Champs_dis/Champ_implementation_P0.cpp b/src/Kernel/Champs_dis/Champ_implementation_P0.cpp
index 21ce4d971a..ab3ef51292 100644
--- a/src/Kernel/Champs_dis/Champ_implementation_P0.cpp
+++ b/src/Kernel/Champs_dis/Champ_implementation_P0.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -21,6 +21,9 @@
 #include <Champ_base.h>
 #include <Domaine.h>
 #include <Domaine_VF.h>
+#include <Table.h>
+#include <TRUSTTrav.h>
+#include <ParserView.h>
 
 DoubleVect& Champ_implementation_P0::valeur_a_elem(const DoubleVect& position, DoubleVect& result, int poly) const
 {
@@ -152,7 +155,8 @@ DoubleTab& Champ_implementation_P0::valeur_aux_elems(const DoubleTab& positions,
   assert(tab_values.line_size() == nb_components);
   assert(tab_values.line_size() == nb_components || nb_components == 1);
 
-  bool kernelOnDevice = tab_result.checkDataOnDevice(tab_values);
+  // PL: 2026/05/11, as VDF and VEF are now well ported on GPU, we force the algorithm on the device:
+  bool kernelOnDevice = true; // tab_result.checkDataOnDevice(tab_values);
   if (kernelOnDevice)
     valeur_aux_elems_kernel<Kokkos::DefaultExecutionSpace>(tab_values, tab_polys, tab_result, nb_components);
   else
@@ -187,7 +191,8 @@ DoubleVect& Champ_implementation_P0::valeur_aux_elems_compo(const DoubleTab& pos
   assert(tab_result.size() == tab_polys.size());
   assert(tab_values.line_size() == le_champ().nb_comp());
 
-  bool kernelOnDevice = tab_result.checkDataOnDevice(tab_values);
+  // PL: 2026/05/11, as VDF and VEF are now well ported on GPU, we force the algorithm on the device:
+  bool kernelOnDevice = true; // tab_result.checkDataOnDevice(tab_values);
   if (kernelOnDevice)
     valeur_aux_elems_compo_kernel<Kokkos::DefaultExecutionSpace>(tab_values, tab_polys, tab_result, ncomp);
   else
@@ -333,3 +338,78 @@ int Champ_implementation_P0::affecter_(const Champ_base& ch)
       return 0;
     }
 }
+
+void Champ_implementation_P0::mettre_a_jour(double t, const Table& table, VECT(OBS_PTR(Champ_base))& les_ch_param)
+{
+  const Domaine_VF& domaine_VF = get_domaine_dis();
+  DoubleTab& mes_valeurs = le_champ().valeurs();
+  const int nb_elem = domaine_VF.nb_elem(), nb_elem_tot = domaine_VF.nb_elem_tot(), nb_param = les_ch_param.size();
+  const int nbcomp = mes_valeurs.dimension(1);
+  const DoubleTab& centres_de_gravites = domaine_VF.xp();
+
+  // ToDo Kokkos: factorize this identity index array somewhere or rewrite valeur_aux_elems!
+  IntTrav les_polys(nb_elem_tot);
+  IntArrView les_polys_v = static_cast<IntVect&>(les_polys).view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem_tot, KOKKOS_LAMBDA(const int num_elem)
+  {
+    les_polys_v(num_elem) = num_elem;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+
+  if (nb_param==1 && nbcomp==1 && table.isfonction()==1)
+    {
+      // Ported on GPU. ToDo Kokkos: extend to more than one parameter field or more than one component
+      DoubleTrav val_param_aux_elems(nb_elem_tot, nbcomp);
+      les_ch_param[0]->valeur_aux_elems(centres_de_gravites, les_polys, val_param_aux_elems);
+      // Build a dedicated ParserView parser usable inside the Kokkos kernel:
+      ParserView parser(table.parser(0));
+      parser.parseString();
+      CDoubleTabView val_params_aux_elems_v = val_param_aux_elems.view_ro();
+      DoubleTabView mes_valeurs_v = mes_valeurs.view_wo();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(
+                             const int num_elem)
+      {
+        int threadId = parser.acquire();
+        for (int ncomp = 0; ncomp < nbcomp; ncomp++)
+          {
+            double val = val_params_aux_elems_v(num_elem, ncomp);
+
+            parser.setVar(0, val, threadId);
+            mes_valeurs_v(num_elem, ncomp) = parser.eval(threadId);
+          }
+        parser.release(threadId);
+      });
+      end_gpu_timer(__KERNEL_NAME__);
+    }
+  else
+    {
+      ToDo_Kokkos("critical");
+      DoubleTabs val_params_aux_elems;
+      for (int i = 0; i < nb_param; i++)
+        {
+          DoubleTab vp(nb_elem_tot, les_ch_param[i]->valeurs().dimension(1));
+          val_params_aux_elems.add(vp);
+        }
+      for (int i = 0; i < nb_param; i++)
+        les_ch_param[i]->valeur_aux_elems(centres_de_gravites, les_polys, val_params_aux_elems[i]);
+
+      if (table.isfonction() != 2)
+        {
+          std::vector<double> vals;
+          vals.reserve(nb_param); // Pre-allocate space once
+          for (int num_elem = 0; num_elem < nb_elem; num_elem++)
+            for (int ncomp = 0; ncomp < nbcomp; ncomp++)
+              {
+                vals.clear();
+                for (int n = 0; n < nb_param; n++)
+                  vals.push_back(val_params_aux_elems[n](num_elem, les_ch_param[n]->valeurs().dimension(1) == 1 ? 0 : ncomp));
+                mes_valeurs(num_elem, ncomp) = table.val(vals, ncomp);
+              }
+        }
+      else
+        {
+          table.valeurs(val_params_aux_elems[0], centres_de_gravites, t, mes_valeurs);
+        }
+    }
+}
+
diff --git a/src/Kernel/Champs_dis/Champ_implementation_P0.h b/src/Kernel/Champs_dis/Champ_implementation_P0.h
index e1d4a74d08..1b77f35fff 100644
--- a/src/Kernel/Champs_dis/Champ_implementation_P0.h
+++ b/src/Kernel/Champs_dis/Champ_implementation_P0.h
@@ -19,6 +19,7 @@
 #include <Champ_implementation.h>
 #include <Frontiere_dis_base.h>
 #include <Frontiere.h>
+class Table;
 
 class Champ_implementation_P0: public Champ_implementation
 {
@@ -32,8 +33,9 @@ class Champ_implementation_P0: public Champ_implementation
   DoubleTab& remplir_coord_noeuds(DoubleTab& positions) const override;
   int remplir_coord_noeuds_et_polys(DoubleTab& positions, IntVect& polys) const override;
   int imprime_P0(Sortie&, int) const;
+  void mettre_a_jour(double, const Table&, VECT(OBS_PTR(Champ_base))&);
 
-  public_for_cuda
+  protected_but_public_for_cuda
   DoubleTab& valeur_aux_sommets_impl(DoubleTab& result) const override;
 
 protected:
diff --git a/src/Kernel/Cond_Lim/Dirichlet_loi_paroi.h b/src/Kernel/Cond_Lim/Dirichlet_loi_paroi.h
index e41e8951ad..21598b6cf8 100644
--- a/src/Kernel/Cond_Lim/Dirichlet_loi_paroi.h
+++ b/src/Kernel/Cond_Lim/Dirichlet_loi_paroi.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -56,9 +56,10 @@ class Dirichlet_loi_paroi: public Dirichlet
 
   virtual double val_imp(int i) const override {return d_(i,0);}
   virtual double val_imp(int i, int j) const override {return d_(i,j);}
+  virtual const DoubleTab& tab_val_imp(double temps=DMAXFLOAT) const override { return d_; }
   virtual double val_imp_au_temps(double temps, int i) const override
   {
-    Process::exit(que_suis_je() + " : You shouldn't go through val_imp_au_temps but through val_imp ! ");
+    Process::exit(que_suis_je() + " : You shouldn't go through   val_imp_au_temps but through val_imp ! ");
     return 1.;
   }
   virtual double val_imp_au_temps(double temps, int i, int j) const override
diff --git a/src/Kernel/Cond_Lim/Echange_global_impose.cpp b/src/Kernel/Cond_Lim/Echange_global_impose.cpp
index 5fcab943c0..563609d408 100644
--- a/src/Kernel/Cond_Lim/Echange_global_impose.cpp
+++ b/src/Kernel/Cond_Lim/Echange_global_impose.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -249,6 +249,22 @@ double Echange_global_impose::flux_exterieur_impose(int i,int j) const
   return champ_exterieur(i,j,phi_ext());
 }
 
+const DoubleTab& Echange_global_impose::tab_phi_ext() const
+{
+  const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis());
+  int size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0);
+  int nb_comp = le_champ_front->valeurs().dimension(1);
+  if (phi_ext_tab_.dimension(0) != size)
+    phi_ext_tab_.resize(size, nb_comp);
+  if (phi_ext_lu_)
+    for (int face = 0; face < size; face++)
+      for (int comp = 0; comp < nb_comp; comp++)
+        phi_ext_tab_(face, comp) = flux_exterieur_impose(face, comp);
+  else
+    phi_ext_tab_ = 0.;
+  return phi_ext_tab_;
+}
+
 double Echange_global_impose::flux_exterieur_impose(int i) const
 {
   return champ_exterieur(i,phi_ext());
diff --git a/src/Kernel/Cond_Lim/Echange_global_impose.h b/src/Kernel/Cond_Lim/Echange_global_impose.h
index ed861315c0..7da5c90117 100644
--- a/src/Kernel/Cond_Lim/Echange_global_impose.h
+++ b/src/Kernel/Cond_Lim/Echange_global_impose.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -63,10 +63,14 @@ class Echange_global_impose: public Echange_impose_base
   virtual double derivee_flux_exterieur_imposee(int i, int j) const;
 
   const bool& has_phi_ext() const { return phi_ext_lu_; }
+  const DoubleTab& tab_phi_ext() const;
 
 protected:
   bool phi_ext_lu_ = false;
   OWN_PTR(Champ_front_base) derivee_phi_ext_, phi_ext_;
+
+private:
+  mutable DoubleTab phi_ext_tab_;
 };
 
 #endif
diff --git a/src/Kernel/Cond_Lim/Echange_impose_base.cpp b/src/Kernel/Cond_Lim/Echange_impose_base.cpp
index 3a9aaa382f..eb0f06a6a2 100644
--- a/src/Kernel/Cond_Lim/Echange_impose_base.cpp
+++ b/src/Kernel/Cond_Lim/Echange_impose_base.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -255,80 +255,94 @@ void Echange_impose_base::associer_fr_dis_base(const Frontiere_dis_base& fr)
   Cond_lim_base::associer_fr_dis_base(fr);
 }
 
-const DoubleTab& Echange_impose_base::tab_T_ext(double temps) const
+const DoubleTab& Echange_impose_base::tab_T_ext(double temps, bool with_virtual_faces) const
 {
   if (temps==DMAXFLOAT) temps = le_champ_front->get_temps_defaut();
   const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis());
   // ToDo factorize in Champ_front_base::valeurs_face()
-  int size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0);
+  int size;
+  if (with_virtual_faces)
+    size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0);
+  else
+    size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces() : le_champ_front->valeurs().dimension(0);
+
   if (size>0)
     {
       bool update = le_champ_front->instationnaire();
-      if (text_.dimension(0) != size)
+      if (tab_text_.dimension(0) != size)
         {
-          text_.resize(size, le_champ_front->valeurs().dimension(1));
+          tab_text_.resize(size, le_champ_front->valeurs().dimension(1));
           update = true;
         }
       update = true;  // Provisoire
       if (update)
         {
-          int nb_comp = text_.dimension(1);
+          int nb_comp = tab_text_.dimension(1);
           for (int face = 0; face < size; face++)
             for (int comp = 0; comp < nb_comp; comp++)
-              text_(face, comp) = T_ext(face, comp);
+              tab_text_(face, comp) = T_ext(face, comp);
         }
     }
-  return text_;
+  return tab_text_;
 }
 
-const DoubleTab& Echange_impose_base::tab_h_imp(double temps) const
+const DoubleTab& Echange_impose_base::tab_h_imp(double temps, bool with_virtual_faces) const
 {
   if (temps==DMAXFLOAT) temps = le_champ_front->get_temps_defaut();
   const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis());
   // ToDo factorize in Champ_front_base::valeurs_face()
-  int size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0);
+  int size;
+  if (with_virtual_faces)
+    size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0);
+  else
+    size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces() : le_champ_front->valeurs().dimension(0);
+
   if (size>0)
     {
       bool update = le_champ_front->instationnaire();
-      if (himp_.dimension(0) != size)
+      if (tab_himp_.dimension(0) != size)
         {
-          himp_.resize(size, le_champ_front->valeurs().dimension(1));
+          tab_himp_.resize(size, le_champ_front->valeurs().dimension(1));
           update = true;
         }
       update = true;  // Provisoire
       if (update)
         {
-          int nb_comp = himp_.dimension(1);
+          int nb_comp = tab_himp_.dimension(1);
           for (int face = 0; face < size; face++)
             for (int comp = 0; comp < nb_comp; comp++)
-              himp_(face, comp) = h_imp(face, comp);
+              tab_himp_(face, comp) = h_imp(face, comp);
         }
     }
-  return himp_;
+  return tab_himp_;
 }
 
-const DoubleTab& Echange_impose_base::tab_emissivite(double temps) const
+const DoubleTab& Echange_impose_base::tab_emissivite(double temps, bool with_virtual_faces) const
 {
   if (temps==DMAXFLOAT) temps = le_champ_front->get_temps_defaut();
   const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis());
   // ToDo factorize in Champ_front_base::valeurs_face()
-  int size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0);
+  int size;
+  if (with_virtual_faces)
+    size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces_tot() : le_champ_front->valeurs().dimension_tot(0);
+  else
+    size = le_champ_front->valeurs().dimension(0) == 1 ? le_bord.nb_faces() : le_champ_front->valeurs().dimension(0);
   if (size>0)
     {
       bool update = le_champ_front->instationnaire();
-      if (eps_.dimension(0) != size)
+      if (tab_eps_.dimension(0) != size)
         {
-          eps_.resize(size, le_champ_front->valeurs().dimension(1));
+          tab_eps_.resize(size, le_champ_front->valeurs().dimension(1));
           update = true;
         }
       update = true;  // Provisoire
       if (update)
         {
-          int nb_comp = eps_.dimension(1);
+          int nb_comp = tab_eps_.dimension(1);
           for (int face = 0; face < size; face++)
             for (int comp = 0; comp < nb_comp; comp++)
-              eps_(face, comp) = emissivite(face, comp);
+              tab_eps_(face, comp) = emissivite(face, comp);
         }
     }
-  return eps_;
+  return tab_eps_;
 }
diff --git a/src/Kernel/Cond_Lim/Echange_impose_base.h b/src/Kernel/Cond_Lim/Echange_impose_base.h
index fb08bfb929..03d46c0acb 100644
--- a/src/Kernel/Cond_Lim/Echange_impose_base.h
+++ b/src/Kernel/Cond_Lim/Echange_impose_base.h
@@ -45,9 +45,9 @@ class Echange_impose_base : public Cond_lim_base
   inline bool has_emissivite() const { return bool(emissivite_); }
   inline bool has_h_imp() const { return bool(h_imp_); }
 
-  const DoubleTab& tab_T_ext(double temps=DMAXFLOAT) const;
-  const DoubleTab& tab_h_imp(double temps=DMAXFLOAT) const;
-  const DoubleTab& tab_emissivite(double temps=DMAXFLOAT) const;
+  const DoubleTab& tab_T_ext(double temps=DMAXFLOAT, bool with_virtual_faces=false) const;
+  const DoubleTab& tab_h_imp(double temps=DMAXFLOAT, bool with_virtual_faces=false) const;
+  const DoubleTab& tab_emissivite(double temps=DMAXFLOAT, bool with_virtual_faces=false) const;
   virtual double T_ext(int num) const;
   virtual double T_ext(int num,int k) const;
   virtual double h_imp(int num) const;
@@ -92,9 +92,9 @@ protected :
   OWN_PTR(Champ_front_base) h_imp_, emissivite_ /* si Echange_externe_radiatif */;
 private:
   // Stocke toutes les valeurs sur les faces (utile pour GPU):
-  mutable DoubleTab text_;
-  mutable DoubleTab himp_;
-  mutable DoubleTab eps_;
+  mutable DoubleTab tab_text_;
+  mutable DoubleTab tab_himp_;
+  mutable DoubleTab tab_eps_;
 };
 
 #endif /* Echange_impose_base_included */
diff --git a/src/Kernel/Cond_Lim/Navier.h b/src/Kernel/Cond_Lim/Navier.h
index 0210fe1fad..89ad5d5aba 100644
--- a/src/Kernel/Cond_Lim/Navier.h
+++ b/src/Kernel/Cond_Lim/Navier.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -32,7 +32,7 @@ class Navier: public Cond_lim_base
   Declare_base(Navier);
 public:
   int compatible_avec_eqn(const Equation_base&) const override { return 1; }
-
+  virtual const DoubleTab* coefficient_frottement() const { return nullptr; };
   virtual double coefficient_frottement(int ) const { return 0.; }
   virtual double coefficient_frottement(int , int ) const { return 0.; }
   virtual double coefficient_frottement_grad(int ) const { return 0.; } // Change the coefficient when calculation of gradient : nu = nullptr
diff --git a/src/Kernel/Cond_Lim/Neumann.cpp b/src/Kernel/Cond_Lim/Neumann.cpp
index 919cf559d0..b5bfc5fd48 100644
--- a/src/Kernel/Cond_Lim/Neumann.cpp
+++ b/src/Kernel/Cond_Lim/Neumann.cpp
@@ -64,7 +64,7 @@ double Neumann::flux_impose(int i, int j) const
  *
  * @return const DoubleTab& Reference to the updated imposed flux array.
  */
-const DoubleTab& Neumann::flux_impose(bool with_virtual_faces) const
+const DoubleTab& Neumann::tab_flux_impose(bool with_virtual_faces) const
 {
   const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis());
   // ToDo factorize in Champ_front_base::valeurs_face()
diff --git a/src/Kernel/Cond_Lim/Neumann.h b/src/Kernel/Cond_Lim/Neumann.h
index 8cebcbe9bd..14e4abdb68 100644
--- a/src/Kernel/Cond_Lim/Neumann.h
+++ b/src/Kernel/Cond_Lim/Neumann.h
@@ -33,7 +33,7 @@ class Neumann: public Cond_lim_base
 public:
   virtual double flux_impose(int i) const;
   virtual double flux_impose(int i, int j) const;
-  const DoubleTab& flux_impose(bool nb_faces_tot=false) const;
+  const DoubleTab& tab_flux_impose(bool nb_faces_tot=false) const;
 
 protected:
   mutable DoubleTab flux_impose_; // Stocke toutes les valeurs du flux sur toutes les faces de la frontiere (pas d'hypothese sur un champ uniforme). Utile pour le GPU.
diff --git a/src/Kernel/Cond_Lim/Neumann_val_ext.h b/src/Kernel/Cond_Lim/Neumann_val_ext.h
index b35eca5966..e56deea570 100644
--- a/src/Kernel/Cond_Lim/Neumann_val_ext.h
+++ b/src/Kernel/Cond_Lim/Neumann_val_ext.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2022, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -37,6 +37,7 @@ class Neumann_val_ext: public Neumann
 
   virtual double val_ext(int i) const = 0;
   virtual double val_ext(int i, int j) const = 0;
+  virtual const DoubleTab& tab_val_ext() const = 0;
 };
 
 #endif
diff --git a/src/Kernel/Framework/Champ_Inc_base.cpp b/src/Kernel/Framework/Champ_Inc_base.cpp
index 0e0dbe9fd3..35296a649e 100644
--- a/src/Kernel/Framework/Champ_Inc_base.cpp
+++ b/src/Kernel/Framework/Champ_Inc_base.cpp
@@ -742,45 +742,93 @@ DoubleTab Champ_Inc_base::valeur_aux_bords() const
     }
   //sinon, calcul a partir des CLs
   const Domaine_VF& domaine = ref_cast(Domaine_VF, domaine_dis_base());
-  const IntTab& f_e = domaine.face_voisins(), &f_s = domaine.face_sommets();
-  DoubleTrav result(domaine.xv_bord().dimension_tot(0), valeurs().line_size());
+  const IntTab& f_s = domaine.face_sommets();
+  DoubleTrav tab_result(domaine.xv_bord().dimension_tot(0), valeurs().line_size());
 
   const Conds_lim& cls = domaine_Cl_dis().les_conditions_limites();
-  int j, k, f, fb, s, n, N = result.line_size(), is_p = (le_nom().debute_par("pression") || le_nom().debute_par("pressure")), n_som;
+  int k, N = tab_result.line_size(), is_p = (le_nom().debute_par("pression") || le_nom().debute_par("pressure")), n_som;
   for (const auto& itr : cls)
     {
       const Front_VF& fr = ref_cast(Front_VF, itr->frontiere_dis());
       //valeur au bord imposee, sauf si c'est une paroi (dans ce cas, la CL peut avoir moins de composantes que le champ -> Energie_Multiphase)
       if (is_p ? sub_type(Neumann, itr.valeur()) : (sub_type(Dirichlet, itr.valeur()) && !sub_type(Scalaire_impose_paroi, itr.valeur())))
-        for (j = 0; j < fr.nb_faces_tot(); j++)
-          for (f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++)
-            result(fb, n) = is_p ? ref_cast(Neumann, itr.valeur()).flux_impose(j, n) : ref_cast(Dirichlet, itr.valeur()).val_imp(j, n);
+        {
+          int nb_faces = domaine.nb_faces();
+          int premiere_face_int = domaine.premiere_face_int();
+          CIntArrView ind_faces_virt_bord = domaine.ind_faces_virt_bord().view_ro();
+          CIntArrView num_face = fr.num_face().view_ro();
+          // For a pressure field (is_p), the Neumann tab_flux_impose array is used as the imposed pressure
+          CDoubleTabView val_imp = is_p ? ref_cast(Neumann, itr.valeur()).tab_flux_impose(true).view_ro() : ref_cast(Dirichlet, itr.valeur()).tab_val_imp().view_ro();
+          DoubleTabView result = tab_result.view_wo();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, fr.nb_faces_tot()), KOKKOS_LAMBDA(const int j)
+          {
+            int f = num_face(j);
+            int fb = f < premiere_face_int ? f : ind_faces_virt_bord(f - nb_faces); // domaine.fbord(f);
+            for (int n = 0; n < N; n++)
+              result(fb, n) = val_imp(j, n);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
+        }
       else if (sub_type(Neumann_val_ext, itr.valeur())) //valeur externe imposee
-        for (j = 0; j < fr.nb_faces_tot(); j++)
-          for (f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++)
-            result(fb, n) = ref_cast(Neumann_val_ext, itr.valeur()).val_ext(j, n);
+        {
+          int nb_faces = domaine.nb_faces();
+          int premiere_face_int = domaine.premiere_face_int();
+          CIntArrView ind_faces_virt_bord = domaine.ind_faces_virt_bord().view_ro();
+          CIntArrView num_face = fr.num_face().view_ro();
+          CDoubleTabView val_ext = ref_cast(Neumann_val_ext, itr.valeur()).tab_val_ext().view_ro();
+          DoubleTabView result = tab_result.view_wo();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, fr.nb_faces_tot()), KOKKOS_LAMBDA(const int j)
+          {
+            int f = num_face(j);
+            int fb = f < premiere_face_int ? f : ind_faces_virt_bord(f - nb_faces); // domaine.fbord(f);
+            for (int n = 0; n < N; n++)
+              result(fb, n) = val_ext(j, n);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
+        }
       else if (sub_type(Champ_Inc_P0_base, *this))
-        for (j = 0; j < fr.nb_faces_tot(); j++) //Champ P0 : on peut prendre la valeur en l'element
-          for (f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++)
-            result(fb, n) = valeurs()(f_e(f, f_e(f, 0) == -1), n);
-      else if (sub_type(Champ_Inc_P1_base, *this))
-        for (j = 0; j < fr.nb_faces_tot(); j++) //Champ P1 : moyenne des valeurs aux sommets
+        {
+          int nb_faces = domaine.nb_faces();
+          int premiere_face_int = domaine.premiere_face_int();
+          CIntArrView ind_faces_virt_bord = domaine.ind_faces_virt_bord().view_ro();
+          CIntArrView num_face = fr.num_face().view_ro();
+          CIntTabView face_voisins = domaine.face_voisins().view_ro();
+          CDoubleTabView inco = valeurs().view_ro();
+          DoubleTabView result = tab_result.view_wo();
+          // P0 field: the value of the neighbouring element can be used directly
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, fr.nb_faces_tot()), KOKKOS_LAMBDA(const int j)
           {
-            f = fr.num_face(j), fb = domaine.fbord(f);
-            for (n_som = 0; n_som < f_s.dimension(1) && f_s(f, n_som) >= 0;)
-              n_som++;
-            for (n = 0; n < N; n++)
-              result(fb, n) = 0;
-            for (k = 0; k < n_som; k++)
-              for (s = f_s(f, k), n = 0; n < N; n++)
-                result(fb, n) += valeurs()(s, n) / n_som;
-          }
+            int f = num_face(j);
+            int fb = f < premiere_face_int ? f : ind_faces_virt_bord(f - nb_faces); // domaine.fbord(f);
+            for (int n = 0; n < N; n++)
+              result(fb, n) = inco(face_voisins(f, face_voisins(f, 0) == -1), n);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
+        }
+      else if (sub_type(Champ_Inc_P1_base, *this))
+        {
+          ToDo_Kokkos("critical");
+          for (int j = 0; j < fr.nb_faces_tot(); j++) //Champ P1 : moyenne des valeurs aux sommets
+            {
+              int f = fr.num_face(j), fb = domaine.fbord(f);
+              for (n_som = 0; n_som < f_s.dimension(1) && f_s(f, n_som) >= 0;)
+                n_som++;
+              for (int n = 0; n < N; n++)
+                tab_result(fb, n) = 0;
+              for (k = 0; k < n_som; k++)
+                for (int s = f_s(f, k), n = 0; n < N; n++)
+                  tab_result(fb, n) += valeurs()(s, n) / n_som;
+            }
+        }
       else if (que_suis_je() == "Champ_P1NC")
-        for (j = 0; j < fr.nb_faces_tot(); j++)
-          for (f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++)
-            result(fb, n) = valeurs()(f, n);
+        {
+          ToDo_Kokkos("critical");
+          for (int j = 0; j < fr.nb_faces_tot(); j++)
+            for (int f = fr.num_face(j), fb = domaine.fbord(f), n = 0; n < N; n++)
+              tab_result(fb, n) = valeurs()(f, n);
+        }
       else
         Process::exit("Champ_Inc_base::valeur_aux_bords() : must code something!");
     }
-  return result;
+  return tab_result;
 }
diff --git a/src/Kernel/Framework/Champ_base.cpp b/src/Kernel/Framework/Champ_base.cpp
index 81678cc92d..f94f4365f6 100644
--- a/src/Kernel/Framework/Champ_base.cpp
+++ b/src/Kernel/Framework/Champ_base.cpp
@@ -176,10 +176,10 @@ DoubleTab& Champ_base::valeur_aux_centres_de_gravite(const Domaine& dom, DoubleT
     }
 
   IntTrav les_polys(nb_elem);
-  IntArrView les_polys_v = static_cast<ArrOfInt&>(les_polys).view_wo();
+  IntArrView polys = static_cast<ArrOfInt&>(les_polys).view_wo();
   Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int i)
   {
-    les_polys_v(i) = i;
+    polys(i) = i;
   });
   end_gpu_timer(__KERNEL_NAME__);
 
diff --git a/src/Kernel/Framework/Debog.cpp b/src/Kernel/Framework/Debog.cpp
index fd7b308e6b..24d208a953 100644
--- a/src/Kernel/Framework/Debog.cpp
+++ b/src/Kernel/Framework/Debog.cpp
@@ -86,6 +86,12 @@ void Debog::verifier_getref(const char* const msg, int x, int& ref)
     Debog_Pb::get_debog_instance()->verifier(msg, x, &ref);
 }
 
+void Debog::verifier(const std::string& msg, const DoubleVect& x)
+{
+  if (Debog_Pb::get_debog_instance())
+    Debog_Pb::get_debog_instance()->verifier(msg.c_str(), x);
+}
+
 void Debog::verifier(const char* const msg, const DoubleVect& x)
 {
   if (Debog_Pb::get_debog_instance())
diff --git a/src/Kernel/Framework/Debog.h b/src/Kernel/Framework/Debog.h
index 4ba1072046..7761ca6b32 100644
--- a/src/Kernel/Framework/Debog.h
+++ b/src/Kernel/Framework/Debog.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -17,7 +17,7 @@
 #define Debog_included
 
 #include <TRUSTTabs_forward.h>
-
+#include <string>
 class Champ_Inc_base;
 class Matrice_Base;
 class MD_Vector;
@@ -29,6 +29,7 @@ class Debog
   static int active();
   static void verifier(const char *const msg, double);
   static void verifier(const char *const msg, int);
+  static void verifier(const std::string& msg, const DoubleVect&);
   static void verifier(const char *const msg, const DoubleVect&);
   static void verifier(const char *const msg, const IntVect&);
   static void verifier_bord(const char *const msg, const DoubleVect& arr, int num_deb);
diff --git a/src/Kernel/Framework/Debog_Pb.tpp b/src/Kernel/Framework/Debog_Pb.tpp
index de02d8dd8f..03b0616e84 100644
--- a/src/Kernel/Framework/Debog_Pb.tpp
+++ b/src/Kernel/Framework/Debog_Pb.tpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -196,7 +196,7 @@ void Debog_Pb::verifier_partie_std(const TRUSTVect<_TYPE_>& reference, const TRU
                       if (delta > seuil_relatif_)
                         {
                           detailed_log_file_ << " DIFF " << message << " reference[" << i2 + j << "]=" << y << " \tcurrent[" << i1 + j
-                                             << "]=" << x << " \trelative error=" << delta << finl;
+                                             << "]=" << x << " \trelative difference=" << delta << finl;
                         }
                     }
                   else   // int
@@ -243,13 +243,13 @@ void Debog_Pb::verifier_partie_std(const TRUSTVect<_TYPE_>& reference, const TRU
   if (Process::je_suis_maitre())
     {
       if (IS_DOUBLE)
-        log_file_ << " " << resu << " : Max relative error " << max_err_items_reels << " (max ref value=" << adim << ") id=" << identificateur << finl;
+        log_file_ << " " << resu << " : Max relative difference " << max_err_items_reels << " (max ref value=" << adim << ") id=" << identificateur << finl;
       else
         log_file_ << " " << resu << " : integer field " << identificateur << finl;
     }
 
   if (IS_DOUBLE)
-    detailed_log_file_ << " " << resu << " : Max relative error " << max_err_items_reels << " (max ref value=" << adim << ") id=" << identificateur << finl;
+    detailed_log_file_ << " " << resu << " : Max relative difference " << max_err_items_reels << " (max ref value=" << adim << ") id=" << identificateur << finl;
   else
     detailed_log_file_ << " " << resu << " : integer field " << identificateur << finl;
 
@@ -404,7 +404,7 @@ Debog_Pb::verifier(const char *const msg, _TYPE_ x, _TYPE_ *ref_value)
           if (delta >= seuil_absolu_ && delta / adim >= seuil_relatif_)
             {
               err = (_TYPE_)delta;
-              detailed_log_file_ << " DIFF (double) reference=" << y << " \tcurrent=" << x << " \trelative error=" << delta / adim << " \t(max ref value=" << adim << ")" << finl;
+              detailed_log_file_ << " DIFF (double) reference=" << y << " \tcurrent=" << x << " \trelative difference=" << delta / adim << " \t(max ref value=" << adim << ")" << finl;
             }
         }
       else // int
@@ -420,7 +420,7 @@ Debog_Pb::verifier(const char *const msg, _TYPE_ x, _TYPE_ *ref_value)
           const char *ok = (err > 0.) ? " ERROR       " : " OK           ";
           if (IS_DOUBLE)
             {
-              log_file_ << ok << " : comparing double: reference=" << y << " absolute error=" << err << finl;
+              log_file_ << ok << " : comparing double: reference=" << y << " absolute difference=" << err << finl;
               if (err > 0.) error_function();
             }
           else // int
diff --git a/src/Kernel/Framework/Domaine_Cl_dis_base.cpp b/src/Kernel/Framework/Domaine_Cl_dis_base.cpp
index c033d8fc17..57d1a02ac2 100644
--- a/src/Kernel/Framework/Domaine_Cl_dis_base.cpp
+++ b/src/Kernel/Framework/Domaine_Cl_dis_base.cpp
@@ -58,7 +58,6 @@ Entree& Domaine_Cl_dis_base::readOn(Entree& is)
 
   int n = ledomaine.nb_front_Cl();
   IntTab front_deja_lu(n);
-  front_deja_lu = 0;
   les_conditions_limites().dimensionner(n);
   int nb_clim=0;
 
diff --git a/src/Kernel/Framework/Ecrire_fichier_xyz_valeur.cpp b/src/Kernel/Framework/Ecrire_fichier_xyz_valeur.cpp
index 64a3face7b..349b9775cb 100644
--- a/src/Kernel/Framework/Ecrire_fichier_xyz_valeur.cpp
+++ b/src/Kernel/Framework/Ecrire_fichier_xyz_valeur.cpp
@@ -198,7 +198,7 @@ void Ecrire_fichier_xyz_valeur::write_fields() const
                       if(champ_stat)
                         {
                           DoubleTab copie(field->valeurs());
-                          field->valeurs() = op_stat->calculer_valeurs();
+                          op_stat->calculer(field->valeurs());
                           field->valeur_aux(pos, val);
                           field->valeurs() = copie;
                         }
diff --git a/src/Kernel/Framework/Equation_base.h b/src/Kernel/Framework/Equation_base.h
index 63f1fc5e62..7b5bc10b52 100644
--- a/src/Kernel/Framework/Equation_base.h
+++ b/src/Kernel/Framework/Equation_base.h
@@ -257,7 +257,7 @@ public :
   }
   inline const bool& diffusion_multi_scalaire() const { return diffusion_multi_scalaire_; }
 
-  public_for_cuda
+  protected_but_public_for_cuda
   void Gradient_conjugue_diff_impl(DoubleTrav& secmem, DoubleTab& solution, int size_terme_mul, const DoubleTab& term_mul);
 
 protected :
diff --git a/src/Kernel/Framework/IBM/Source_PDF_base.cpp b/src/Kernel/Framework/IBM/Source_PDF_base.cpp
index 4fecca3722..9f8c32508e 100644
--- a/src/Kernel/Framework/IBM/Source_PDF_base.cpp
+++ b/src/Kernel/Framework/IBM/Source_PDF_base.cpp
@@ -771,7 +771,6 @@ void Source_PDF_base::update_elem_IBM(DoubleTab& vecteur_deplacement, double alp
   assert (nb_elem == vecteur_deplacement.dimension(0));
   assert (dim_esp == vecteur_deplacement.dimension(1));
   IntTab indic_dead_cell(nb_elem);
-  indic_dead_cell = 0;
 
   // calcul voisins de chaque element traverse
   IntLists elem_voisins(nb_elem_tot);
diff --git a/src/Kernel/Framework/Operateur.cpp b/src/Kernel/Framework/Operateur.cpp
index fc3b9267b8..fbdd0a834d 100644
--- a/src/Kernel/Framework/Operateur.cpp
+++ b/src/Kernel/Framework/Operateur.cpp
@@ -20,6 +20,7 @@
 #include <TRUSTTrav.h>
 #include <Operateur.h>
 #include <Perf_counters.h>
+#include <Debog.h>
 
 Sortie& Operateur::ecrire(Sortie& os) const
 {
@@ -147,6 +148,7 @@ void Operateur::completer()
     le_champ_inco=mon_equation->inconnue();
 
   l_op_base().completer();
+  op_base_ = l_op_base().que_suis_je();
 }
 
 void Operateur::associer_champ(const Champ_Inc_base& ch, const std::string& nom_ch)
@@ -230,6 +232,7 @@ int Operateur::impr(Sortie& os) const
  */
 DoubleTab& Operateur::ajouter(const Champ_Inc_base& ch, DoubleTab& resu) const
 {
+  Debog::verifier(op_base_+"::ajouter(ch,resu) avant ch=",ch.valeurs());
   int i ;
   int nstep=l_op_base().get_nb_ss_pas_de_temps();
   double dt=equation().schema_temps().pas_de_temps();
@@ -249,6 +252,7 @@ DoubleTab& Operateur::ajouter(const Champ_Inc_base& ch, DoubleTab& resu) const
       solveur_masse.appliquer(derivee);
       inco.ajoute_sans_ech_esp_virt(dt, derivee, VECT_ALL_ITEMS) ;
     }
+  Debog::verifier(op_base_+"::ajouter(ch,resu) apres resu=",resu);
   return resu;
 }
 
@@ -282,7 +286,10 @@ DoubleTab& Operateur::calculer(const Champ_Inc_base& ch,DoubleTab& resu) const
  */
 DoubleTab& Operateur::ajouter(DoubleTab& resu) const
 {
-  return ajouter(le_champ_inco->valeurs(), resu);
+  Debog::verifier(op_base_+"::ajouter() avant resu=",resu); // checkpoint on resu before the operator is applied
+  DoubleTab& ref = ajouter(le_champ_inco->valeurs(), resu); // bind the returned table directly; an OBS_PTR wrapper adds nothing here
+  Debog::verifier(op_base_+"::ajouter() apres resu=",resu); // checkpoint on resu after the operator is applied
+  return ref;
 }
 
 /*! @brief Applique l'operateur au champ inconnu et renvoie le resultat.
diff --git a/src/Kernel/Framework/Operateur.h b/src/Kernel/Framework/Operateur.h
index b792b816ba..6f54bb3070 100644
--- a/src/Kernel/Framework/Operateur.h
+++ b/src/Kernel/Framework/Operateur.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -71,7 +71,7 @@ public :
   virtual int op_non_nul() const =0;
 
 protected :
-  std::string nom_inco_;
+  std::string nom_inco_, op_base_;
   OBS_PTR(Champ_Inc_base) le_champ_inco;
   Motcle typ;
 };
diff --git a/src/Kernel/Framework/Operateur_base.cpp b/src/Kernel/Framework/Operateur_base.cpp
index 8a0577b48e..d0dcc2f515 100644
--- a/src/Kernel/Framework/Operateur_base.cpp
+++ b/src/Kernel/Framework/Operateur_base.cpp
@@ -233,7 +233,11 @@ DoubleTab&  Operateur_base::ajouter(const DoubleTab& inco, DoubleTab& secmem) co
       if (equation().discretisation().is_poly_family())
         ajouter_blocs({}, secmem);
       else
-        ajouter_blocs({}, secmem, {{ equation().inconnue().le_nom().getString(),inco }} ); //pour prise en compte du parametre inco (qui est pas forcement l'inco de l'equation)
+        {
+          tabs_t semi_impl; // carries the inco argument, which is not necessarily the equation's own unknown
+          semi_impl[equation().inconnue().le_nom().getString()].ref(inco); /* avoids copying inco into tabs_t */
+          ajouter_blocs({}, secmem, semi_impl);
+        }
     }
   else Process::exit(que_suis_je() + " : ajouter() not coded!");
   return secmem;
diff --git a/src/Kernel/Framework/Solveur_Masse_Elem_proto.cpp b/src/Kernel/Framework/Solveur_Masse_Elem_proto.cpp
index 37a80f6d53..5a93bda027 100644
--- a/src/Kernel/Framework/Solveur_Masse_Elem_proto.cpp
+++ b/src/Kernel/Framework/Solveur_Masse_Elem_proto.cpp
@@ -36,24 +36,28 @@ void Solveur_Masse_Elem_proto::preparer_calcul_proto()
   solv_mass_->equation().init_champ_conserve();
 }
 
-DoubleTab& Solveur_Masse_Elem_proto::appliquer_impl_proto(DoubleTab& sm) const
+DoubleTab& Solveur_Masse_Elem_proto::appliquer_impl_proto(DoubleTab& tab_sm) const
 {
   const Domaine_VF& domaine = le_dom_.valeur();
-  const DoubleVect& ve = domaine.volumes(), &pe = solv_mass_->equation().milieu().porosite_elem();
-  const DoubleTab& der = solv_mass_->equation().champ_conserve().derivees().at(solv_mass_->equation().inconnue().le_nom().getString());
-
-  int e, ne_tot = domaine.nb_elem_tot(), n, N = sm.line_size();
-  assert(sm.dimension_tot(0) >= ne_tot && N == der.line_size());
+  int ne_tot = domaine.nb_elem_tot(), N = tab_sm.line_size();
+  const DoubleTab& tab_der = solv_mass_->equation().champ_conserve().derivees().at(solv_mass_->equation().inconnue().le_nom().getString());
+  assert(tab_sm.dimension_tot(0) >= ne_tot && N == tab_der.line_size());
 
   /* partie elem */
-  for (e = 0; e < ne_tot; e++)
-    for (n = 0; n < N; n++)
-      if (std::abs(der(e, n)) > 1e-10)
+  CDoubleArrView pe = solv_mass_->equation().milieu().porosite_elem().view_ro(); // view_ro() access to the medium's porosite_elem() array
+  CDoubleArrView ve = domaine.volumes().view_ro(); // view_ro() access to the domain volumes() array
+  CDoubleTabView der = tab_der.view_ro(); // view_ro() access to the derivative table looked up above
+  DoubleTabView sm = tab_sm.view_rw(); // view_rw() access: the kernel below modifies tab_sm through this view
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, ne_tot), KOKKOS_LAMBDA( const int e) // one iteration per element index e in [0, ne_tot)
+  {
+    for (int n = 0; n < N; n++) // N is the line size of tab_sm (asserted equal to the one of tab_der)
+      if (Kokkos::fabs(der(e, n)) > 1e-10) // only divide when the derivative is not (numerically) zero
         sm(e, n) /= pe(e) * ve(e) * der(e, n);
       else
         sm(e, n) = 0; //cas d'une evanescence
-
-  return sm;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  return tab_sm;
 }
 
 void Solveur_Masse_Elem_proto::dimensionner_blocs_proto(matrices_t matrices, const tabs_t& semi_impl) const
diff --git a/src/Kernel/Framework/Solveur_Masse_base.cpp b/src/Kernel/Framework/Solveur_Masse_base.cpp
index 01e61e9d18..d3e2b64e78 100644
--- a/src/Kernel/Framework/Solveur_Masse_base.cpp
+++ b/src/Kernel/Framework/Solveur_Masse_base.cpp
@@ -125,6 +125,7 @@ DoubleTab& Solveur_Masse_base::appliquer(DoubleTab& x) const
       DoubleTab_parts values_parts(values);
       tab_divide_any_shape(x, values_parts[0], VECT_REAL_ITEMS);
     }
+  Debog::verifier("Solveur_Masse_base::appliquer before appliquer_impl, x:",x);
   return appliquer_impl(x); // M-1.x
 }
 
diff --git a/src/Kernel/Framework/Sources.cpp b/src/Kernel/Framework/Sources.cpp
index 369295b583..f9a718a90b 100644
--- a/src/Kernel/Framework/Sources.cpp
+++ b/src/Kernel/Framework/Sources.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -16,6 +16,7 @@
 #include <Matrice_Morse.h>
 #include <TRUSTTab.h>
 #include <Sources.h>
+#include <Debog.h>
 
 Implemente_instanciable(Sources, "Sources", LIST(Source));
 // XD sources listobj sources -1 source_base -1 The sources.
@@ -84,7 +85,12 @@ Entree& Sources::readOn(Entree& is)
  */
 DoubleTab& Sources::ajouter(DoubleTab& xx) const
 {
-  for (const auto& itr : *this) itr.ajouter(xx);
+  Debog::verifier("Sources::ajouter() debut xx=",xx); // checkpoint on xx before any source of the list is added
+  for (const auto& itr : *this)
+    {
+      itr.ajouter(xx); // each source of the list adds its contribution to xx
+      Debog::verifier(itr.que_suis_je()+"::ajouter() apres xx=",xx); // one checkpoint per source, labelled with its que_suis_je()
+    }
   return xx;
 }
 
diff --git a/src/Kernel/Geometrie/DecoupeBord.cpp b/src/Kernel/Geometrie/DecoupeBord.cpp
index ce0fef8ff4..4a97cd026f 100644
--- a/src/Kernel/Geometrie/DecoupeBord.cpp
+++ b/src/Kernel/Geometrie/DecoupeBord.cpp
@@ -222,11 +222,11 @@ void Impl_32_64<_SIZE_>::create_listb_from_xyz(const Domaine_t& dom, const Noms&
                 for (int j=0; j<Objet_U::dimension; j++)
                   xg1[j]+=xs1(sommets_face1(face1,i),j);  // centre de gravite de la face1 (non divise par nb_som_face)
               xg1/=nb_som_face;
-              parser.setVar("x",xg1[0]);
+              parser.setVar(0,xg1[0]);
 
-              parser.setVar("y",xg1[1]);
+              parser.setVar(1,xg1[1]);
               if (Objet_U::dimension==3)
-                parser.setVar("z",xg1[2]);
+                parser.setVar(2,xg1[2]);
               double res=parser.eval();
               int face_min=static_cast<int>(res+0.5);
               nb=std::max(nb,face_min);
diff --git a/src/Kernel/Geometrie/DomaineCutter.cpp b/src/Kernel/Geometrie/DomaineCutter.cpp
index 581e49d8a4..98ce03ddbf 100644
--- a/src/Kernel/Geometrie/DomaineCutter.cpp
+++ b/src/Kernel/Geometrie/DomaineCutter.cpp
@@ -1304,7 +1304,6 @@ void DomaineCutter_32_64<_SIZE_>::ecrire_domaines(const Nom& basename, const Dom
 
   //To detect my parts (when running in parallel)
   ArrOfInt myDomaines(nb_parties_);
-  myDomaines = 0;
   ArrsOfInt otherProcDomaines(Process::nproc());
 
   //if some domains are splitted between multiple procs,
@@ -1681,7 +1680,6 @@ void DomaineCutter_32_64<_SIZE_>::ecrire_domaines(const Nom& basename, const Dom
                     }
                 }
               ArrOfInt tmp_edge_cut(nb_parties_);
-              tmp_edge_cut = 0;
               recevoir(tmp_edge_cut, proc, 0, proc+2008);
 
               for(int i_part=0; i_part<nb_parties_; i_part++)
diff --git a/src/Kernel/Geometrie/Domaine_bord.cpp b/src/Kernel/Geometrie/Domaine_bord.cpp
index 6f8e998573..9ade24cd19 100644
--- a/src/Kernel/Geometrie/Domaine_bord.cpp
+++ b/src/Kernel/Geometrie/Domaine_bord.cpp
@@ -102,7 +102,7 @@ void type_face_to_type_elem(const Elem_geom_base_32_64<_SIZE_>& type_elem, const
       type_elem_face = "quadrangle_VEF";
       break;
     case Type_Face::quadrangle_3D:
-      type_elem_face = (sub_type(Hexaedre,type_elem)?"rectangle":"quadrangle_VEF");
+      type_elem_face = ((sub_type(Hexaedre,type_elem) || sub_type(Hexaedre_64, type_elem))?"rectangle":"quadrangle_VEF");
       break;
     case Type_Face::quadrangle_3D_axi:
       type_elem_face = "quadrangle_VEF";
diff --git a/src/Kernel/Geometrie/Extraire_domaine.cpp b/src/Kernel/Geometrie/Extraire_domaine.cpp
index 41f49f8302..2b9dd71fa1 100644
--- a/src/Kernel/Geometrie/Extraire_domaine.cpp
+++ b/src/Kernel/Geometrie/Extraire_domaine.cpp
@@ -89,7 +89,7 @@ Entree& Extraire_domaine::interpreter_(Entree& is)
           parser_condition_elements.setVar(2,xp(elem,2),threadId);
         double res=parser_condition_elements.eval(threadId);
         parser_condition_elements.release(threadId);
-        if (std::fabs(res)>1e-5)
+        if (Kokkos::fabs(res)>1e-5)
           {
             marq_elem(elem)=1;
             local_nb_elem_m++;
diff --git a/src/Kernel/Geometrie/Extraire_surface.cpp b/src/Kernel/Geometrie/Extraire_surface.cpp
index d3a84e930e..e3089c6996 100644
--- a/src/Kernel/Geometrie/Extraire_surface.cpp
+++ b/src/Kernel/Geometrie/Extraire_surface.cpp
@@ -143,7 +143,7 @@ void Extraire_surface::extraire_surface_without_cleaning(Domaine& domaine_surfac
       parser_condition_elements.setVar(2,xp(elem,2),threadId);
     double res = parser_condition_elements.eval(threadId);
     parser_condition_elements.release(threadId);
-    marq_elem(elem) = std::fabs(res)>1e-5 ? 1 : 0;
+    marq_elem(elem) = Kokkos::fabs(res)>1e-5 ? 1 : 0;
   });
   end_gpu_timer(__KERNEL_NAME__);
   tab_marq_elem.echange_espace_virtuel();
@@ -226,7 +226,7 @@ void Extraire_surface::extraire_surface_without_cleaning(Domaine& domaine_surfac
               parser_condition_faces.setVar(2,xv(fac,2),threadId);
             double res=parser_condition_faces.eval(threadId);
             parser_condition_faces.release(threadId);
-            if (std::fabs(res)>1e-5)
+            if (Kokkos::fabs(res)>1e-5)
               if (marq[fac]!=-1)  // pas un joint, ou on est le proprietaire
                 {
                   marq[fac]=1;
diff --git a/src/Kernel/Geometrie/Transformer.cpp b/src/Kernel/Geometrie/Transformer.cpp
index 4d186ecd3b..dfdb3040d1 100644
--- a/src/Kernel/Geometrie/Transformer.cpp
+++ b/src/Kernel/Geometrie/Transformer.cpp
@@ -93,9 +93,9 @@ void Transformer_32_64<_SIZE_>::transformer(Domaine_t& dom, Noms& les_fcts)
         }
       for (int j = 0; j < Objet_U::dimension; j++)
         {
-          fxyz[j].setVar("x", x);
-          fxyz[j].setVar("y", y);
-          fxyz[j].setVar("z", z);
+          fxyz[j].setVar(0, x);
+          fxyz[j].setVar(1, y);
+          fxyz[j].setVar(2, z);
           new_sommets(i, j) = fxyz[j].eval();
         }
     }
diff --git a/src/Kernel/Math/MD_Vector_base.h b/src/Kernel/Math/MD_Vector_base.h
index d76550bf88..44118d0c61 100644
--- a/src/Kernel/Math/MD_Vector_base.h
+++ b/src/Kernel/Math/MD_Vector_base.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/src/Kernel/Math/MD_Vector_composite.h b/src/Kernel/Math/MD_Vector_composite.h
index c191caac4c..92ef1154ff 100644
--- a/src/Kernel/Math/MD_Vector_composite.h
+++ b/src/Kernel/Math/MD_Vector_composite.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/src/Kernel/Math/MD_Vector_mono.cpp b/src/Kernel/Math/MD_Vector_mono.cpp
index 09daa49370..302ffb6bf7 100644
--- a/src/Kernel/Math/MD_Vector_mono.cpp
+++ b/src/Kernel/Math/MD_Vector_mono.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -42,10 +42,8 @@ Sortie& MD_Vector_mono::printOn(Sortie& os) const
   return os;
 }
 
-void flatten(const ArrOfInt& blocs_items, ArrOfInt& items)
+static void flatten(const ArrOfInt& blocs_items, ArrOfInt& items)
 {
-  // Build from blocs_items_to_sum_
-  //const ArrOfInt& blocs_items_to_sum = get_blocs_items_to_sum();
   const int nblocs = blocs_items.size_array() >> 1;
   const int *bloc_itr = blocs_items.addr();
   int size = 0;
@@ -56,30 +54,27 @@ void flatten(const ArrOfInt& blocs_items, ArrOfInt& items)
       size += end_bloc - begin_bloc;
     }
   items.resize(size);
-  int item = 0;
+  int k = 0;
   bloc_itr = blocs_items.addr();
   for (int bloc=0; bloc<nblocs; bloc++)
     {
       const int begin_bloc = (*(bloc_itr++));
       const int end_bloc = (*(bloc_itr++));
       for (int i=begin_bloc; i<end_bloc; i++)
-        {
-          items(item) = i;
-          item++;
-        }
+        items(k++) = i;
     }
 }
 
 const ArrOfInt& MD_Vector_mono::get_items_to_sum() const
 {
-  if (items_to_sum_.size_array()==0)
-    flatten(get_blocs_items_to_sum(), items_to_sum_);
+  if (items_to_sum_.size_array() == 0 && blocs_items_to_sum_.size_array() > 0) // flat list still empty and there are blocs to expand
+    flatten(blocs_items_to_sum_, items_to_sum_); // expand the [begin,end) pairs into an explicit list of item indices
   return items_to_sum_;
 }
 
 const ArrOfInt& MD_Vector_mono::get_items_to_compute() const
 {
-  if (items_to_compute_.size_array()==0)
-    flatten(get_blocs_items_to_compute(), items_to_compute_);
+  if (items_to_compute_.size_array() == 0 && blocs_items_to_compute_.size_array() > 0) // flat list still empty and there are blocs to expand
+    flatten(blocs_items_to_compute_, items_to_compute_); // expand the [begin,end) pairs into an explicit list of item indices
   return items_to_compute_;
 }
diff --git a/src/Kernel/Math/MD_Vector_mono.h b/src/Kernel/Math/MD_Vector_mono.h
index 7e246f5ece..bfe8b97b42 100644
--- a/src/Kernel/Math/MD_Vector_mono.h
+++ b/src/Kernel/Math/MD_Vector_mono.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/src/Kernel/Math/Matrices/Matrice_Base.cpp b/src/Kernel/Math/Matrices/Matrice_Base.cpp
index fb0f2f987c..1439c7bd30 100644
--- a/src/Kernel/Math/Matrices/Matrice_Base.cpp
+++ b/src/Kernel/Math/Matrices/Matrice_Base.cpp
@@ -55,7 +55,7 @@ void Matrice_Base::build_stencil()
   Process::exit( );
 }
 
-int Matrice_Base::get_stencil_size() const
+decltype(Stencil().dimension( 0 )) Matrice_Base::get_stencil_size() const // explicit type: must match the declaration in Matrice_Base.h
 {
   return stencil_.dimension( 0 );
 }
diff --git a/src/Kernel/Math/Matrices/Matrice_Base.h b/src/Kernel/Math/Matrices/Matrice_Base.h
index 629e43e55f..845a404ead 100644
--- a/src/Kernel/Math/Matrices/Matrice_Base.h
+++ b/src/Kernel/Math/Matrices/Matrice_Base.h
@@ -78,15 +78,18 @@ public :
 
   virtual void get_symmetric_stencil_and_coefficients(Stencil& stencil, StencilCoeffs& coefficients) const;
 
-  int get_stencil_size() const ;
+  decltype(Stencil().dimension( 0 )) get_stencil_size() const ; // not a bare 'auto': a deduced return type is unusable from files that only see this declaration
   virtual void build_stencil();
 
   void set_stencil( const Stencil& stencil );
 
   bool is_stencil_up_to_date() const ;
+  void set_has_constant_nullspace(bool has_constant_nullspace) { has_constant_nullspace_ = has_constant_nullspace; }
+  bool has_constant_nullspace() const { return has_constant_nullspace_; }
 
 protected:
   bool is_stencil_up_to_date_ = false;
+  bool has_constant_nullspace_ = false;
   Stencil stencil_ ;
 };
 
diff --git a/src/Kernel/Math/Matrices/Matrice_Bloc.cpp b/src/Kernel/Math/Matrices/Matrice_Bloc.cpp
index d91f2e2269..713d154663 100644
--- a/src/Kernel/Math/Matrices/Matrice_Bloc.cpp
+++ b/src/Kernel/Math/Matrices/Matrice_Bloc.cpp
@@ -20,6 +20,7 @@
 #include <TRUSTArrays.h>
 #include <TRUSTTabs.h>
 #include <Matrice_Nulle.h>
+#include <algorithm>
 
 Implemente_instanciable_sans_constructeur(Matrice_Bloc,"Matrice_Bloc",Matrice_Base);
 
@@ -229,8 +230,8 @@ void Matrice_Bloc::get_stencil( Stencil& stencil ) const
           Stencil& local_stencil = local_stencils[ i * nb_column_blocks + j ];
           local_matrix.get_stencil( local_stencil );
 
-          const int size = local_stencil.dimension( 0 );
-          for ( int k=0; k<size; ++k )
+          const auto size = local_stencil.dimension( 0 );
+          for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
             {
               local_stencil( k, 0 ) += imin;
               local_stencil( k, 1 ) += jmin;
@@ -248,9 +249,9 @@ void Matrice_Bloc::get_stencil( Stencil& stencil ) const
   for ( int i=0; i<nb_stencils; ++i )
     {
       const Stencil& local_stencil = local_stencils[ i ];
-      const int size = local_stencil.dimension( 0 );
+      const auto size = local_stencil.dimension( 0 );
 
-      for ( int k=0; k<size; ++k )
+      for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
         {
           const int line = local_stencil( k, 0 );
           offsets[ line + 1 ] += 1;
@@ -270,9 +271,9 @@ void Matrice_Bloc::get_stencil( Stencil& stencil ) const
   for ( int i=0; i<nb_stencils; ++i )
     {
       const Stencil& local_stencil = local_stencils[ i ];
-      const int size = local_stencil.dimension( 0 );
+      const auto size = local_stencil.dimension( 0 );
 
-      for ( int k=0; k<size; ++k )
+      for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
         {
           const int line   = local_stencil( k, 0 );
           const int column = local_stencil( k, 1 );
@@ -328,8 +329,8 @@ void Matrice_Bloc::build_stencil()
           Stencil& local_stencil = local_stencils[ i * nb_column_blocks + j ];
           local_matrix.get_stencil( local_stencil );
 
-          const int size = local_stencil.dimension( 0 );
-          for ( int k=0; k<size; ++k )
+          const auto size = local_stencil.dimension( 0 );
+          for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
             {
               local_stencil( k, 0 ) += imin;
               local_stencil( k, 1 ) += jmin;
@@ -347,9 +348,9 @@ void Matrice_Bloc::build_stencil()
   for ( int i=0; i<nb_stencils; ++i )
     {
       const Stencil& local_stencil = local_stencils[ i ];
-      const int size = local_stencil.dimension( 0 );
+      const auto size = local_stencil.dimension( 0 );
 
-      for ( int k=0; k<size; ++k )
+      for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
         {
           const int line = local_stencil( k, 0 );
           offsets[ line + 1 ] += 1;
@@ -371,9 +372,9 @@ void Matrice_Bloc::build_stencil()
   for ( int i=0; i<nb_stencils; ++i )
     {
       const Stencil& local_stencil = local_stencils[ i ];
-      const int size = local_stencil.dimension( 0 );
+      const auto size = local_stencil.dimension( 0 );
 
-      for ( int k=0; k<size; ++k )
+      for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
         {
           const int line   = local_stencil( k, 0 );
           const int column = local_stencil( k, 1 );
@@ -440,9 +441,9 @@ void Matrice_Bloc::get_stencil_coeff_templ( Stencil& stencil, _TAB_T_& coeff_sp)
 
           _get_sub_stencil_coeff<_TAB_T_>(local_matrix, local_stencil, local_coeff);
 
-          const int size = local_stencil.dimension( 0 );
+          const auto size = local_stencil.dimension( 0 );
 
-          for ( int k=0; k<size; ++k )
+          for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
             {
               const int line           = local_stencil( k, 0 ) + imin;
               const int index          = offsets[ line ];
@@ -473,8 +474,8 @@ void Matrice_Bloc::get_stencil_coeff_templ( Stencil& stencil, _TAB_T_& coeff_sp)
 
           _get_sub_stencil_coeff<_TAB_T_>(local_matrix, local_stencil, local_coefficients);
 
-          const int size = local_stencil.dimension( 0 );
-          for ( int k=0; k<size; ++k )
+          const auto size = local_stencil.dimension( 0 );
+          for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
             {
               local_stencil( k, 0 ) += imin;
               local_stencil( k, 1 ) += jmin;
@@ -492,9 +493,9 @@ void Matrice_Bloc::get_stencil_coeff_templ( Stencil& stencil, _TAB_T_& coeff_sp)
   for ( int i=0; i<nb_stencils; ++i )
     {
       const Stencil& local_stencil = vect_local_stencils[ i ];
-      const int size = local_stencil.dimension( 0 );
+      const auto size = local_stencil.dimension( 0 );
 
-      for ( int k=0; k<size; ++k )
+      for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
         {
           const int line = local_stencil( k, 0 );
           offsets[ line + 1 ] += 1;
@@ -517,9 +518,9 @@ void Matrice_Bloc::get_stencil_coeff_templ( Stencil& stencil, _TAB_T_& coeff_sp)
       const Stencil&      local_stencil= vect_local_stencils[ i ];
       const _TAB_T_& local_coefficients = vect_local_coefficients[ i ];
 
-      const int size = local_stencil.dimension( 0 );
+      const auto size = local_stencil.dimension( 0 );
 
-      for ( int k=0; k<size; ++k )
+      for ( decltype( local_stencil.dimension( 0 ) ) k=0; k<size; ++k ) // index typed like size; 'auto k=0' is always int and would overflow on a 64-bit size
         {
           const int line        = local_stencil( k, 0 );
           const int column      = local_stencil( k, 1 );
@@ -557,7 +558,7 @@ void Matrice_Bloc::get_stencil_and_coefficients(Stencil& stencil, StencilCoeffs&
 {
   if( is_stencil_up_to_date_ )
     {
-      const int stencil_size = stencil_.dimension(0);
+      const auto stencil_size = stencil_.dimension(0);
       coefficients.resize_array( stencil_size );
     }
 
@@ -683,236 +684,187 @@ void Matrice_Bloc::BlocToMatMorse( Matrice_Morse& result ) const
   Matrix_tools::convert_to_morse_matrix( (*this), result );
 }
 
-
-//Produit matrice-vecteur
-
-// Remplissage partiel (i premieres lignes) d'une matrice bloc par une matrice morse symetrique
-// RR | RV
-// ------
-// VR | VV
-void Matrice_Bloc::remplir(const IntLists& voisins, const DoubleLists& valeurs, const DoubleVect& terme_diag, const int i, const int n)
-{
-  dimensionner(2, 2) ;
-  get_bloc(0,0).typer("Matrice_Morse_Sym");        // Bloc RR
-  get_bloc(0,1).typer("Matrice_Morse");        // Bloc RV
-  get_bloc(1,0).typer("Matrice_Morse");        // Bloc VR
-  get_bloc(1,1).typer("Matrice_Morse");        // Bloc VV
-  remplir(voisins, valeurs, terme_diag, i, n, i, n);
-}
-
-// Remplissage partiel (n premieres lignes, m premieres colonnes) d'une matrice bloc par une matrice morse non symetrique
-// RR | RV
-// ------
-// VR | VV
-void Matrice_Bloc::remplir(const IntLists& voisins, const DoubleLists& valeurs, const int i, const int n, const int j, const int m)
-{
-  dimensionner(2, 2) ;
-  get_bloc(0,0).typer("Matrice_Morse");        // Bloc RR
-  get_bloc(0,1).typer("Matrice_Morse");        // Bloc RV
-  get_bloc(1,0).typer("Matrice_Morse");        // Bloc VR
-  get_bloc(1,1).typer("Matrice_Morse");        // Bloc VV
-  DoubleVect diagonale_vide;
-  remplir(voisins, valeurs, diagonale_vide, i, n, j, m);
-}
-
-// Remplissage partiel (i premieres lignes, j premieres colonnes) d'une matrice bloc par une matrice morse symetrique ou non
-// RR | RV
-// ------
-// VR | VV
-void Matrice_Bloc::remplir(const IntLists& voisins, const DoubleLists& valeurs, const DoubleVect& terme_diag, const int i, const int n, const int j, const int m)
-{
-  Matrice_Morse& RR=ref_cast(Matrice_Morse,get_bloc(0,0).valeur());
-  Matrice_Morse& RV=ref_cast(Matrice_Morse,get_bloc(0,1).valeur());
-  Matrice_Morse& VR=ref_cast(Matrice_Morse,get_bloc(1,0).valeur());
-  Matrice_Morse& VV=ref_cast(Matrice_Morse,get_bloc(1,1).valeur());
-
-  // Premiere passe pour le dimensionnement
-  int RR_compteur;
-  auto RR_rang=0;
-  int RV_compteur;
-  auto RV_rang=0;
-  int VR_compteur;
-  auto VR_rang=0;
-  int VV_compteur;
-  auto VV_rang=0;
-
-  int num_elem;
-  for (num_elem=0; num_elem<n; num_elem++)
-    {
-      IntList_Curseur liste_vois(voisins[num_elem]);
-      DoubleList_Curseur liste_val(valeurs[num_elem]);
-
-      RR_compteur=0;
-      RV_compteur=0;
-      VR_compteur=0;
-      VV_compteur=0;
-
-      // Diagonale
-      if (terme_diag.size_array()!=0)
-        {
-          if (num_elem<i)
-            RR_compteur++;
-          else
-            VV_compteur++;
-        }
-
-      while (liste_vois)
-        {
-          int colonne = liste_vois.valeur();
-          if (colonne<j)
-            {
-              if (num_elem<i)
-                RR_compteur++;         // Sous Bloc RR
-              else
-                VR_compteur++;        // Sous Bloc VR
-            }
-          else
-            {
-              if (num_elem<i)
-                RV_compteur++;        // Sous Bloc RV
-              else
-                VV_compteur++;         // Sous Bloc VV
-            }
-          ++liste_vois;
-          ++liste_val;
-        }
-      RR_rang += RR_compteur;
-      RV_rang += RV_compteur;
-      VR_rang += VR_compteur;
-      VV_rang += VV_compteur;
-    }
-  RR.dimensionner(i,        j,        RR_rang);        // Dimension RR
-  RV.dimensionner(i,        m-j,        RV_rang);        // Dimension RV
-  VR.dimensionner(n-i,        j,        VR_rang);        // Dimension VR
-  VV.dimensionner(n-i,        m-j,        VV_rang);        // Dimension VV
-
-  // Initialisations necessaires
-  RR.get_set_tab1()=1;
-  RV.get_set_tab1()=1;
-  VR.get_set_tab1()=1;
-  VV.get_set_tab1()=1;
-
-  // Deuxieme passe pour le remplissage
-  // Tableaux tab1, tab2 et coeff_ pour le bloc RR
-  auto* RR_tab1 = RR.get_set_tab1().addr();
-  int* RR_tab2 = RR.get_set_tab2().addr();
-  double* RR_coeff = RR.get_set_coeff().addr();
-  int* RR_tab2_ptr = RR_tab2;
-
-  // Tableaux tab1, tab2 et coeff_ pour le bloc RV
-  auto* RV_tab1 = RV.get_set_tab1().addr();
-  int* RV_tab2 = RV.get_set_tab2().addr();
-  double* RV_coeff = RV.get_set_coeff().addr();
-  int* RV_tab2_ptr = RV_tab2;
-
-  // Tableaux tab1, tab2 et coeff_ pour le bloc VR
-  auto* VR_tab1 = VR.get_set_tab1().addr();
-  int* VR_tab2 = VR.get_set_tab2().addr();
-  double* VR_coeff = VR.get_set_coeff().addr();
-  int* VR_tab2_ptr = VR_tab2;
-
-  // Tableaux tab1, tab2 et coeff_ pour le bloc VV
-  auto* VV_tab1 = VV.get_set_tab1().addr();
-  int* VV_tab2 = VV.get_set_tab2().addr();
-  double* VV_coeff = VV.get_set_coeff().addr();
-  int* VV_tab2_ptr = VV_tab2;
-
-  RR_rang=0;
-  RV_rang=0;
-  VR_rang=0;
-  VV_rang=0;
-  for (num_elem=0; num_elem<n; num_elem++)
-    {
-      IntList_Curseur liste_vois(voisins[num_elem]);
-      DoubleList_Curseur liste_val(valeurs[num_elem]);
-
-      RR_compteur=0;
-      RV_compteur=0;
-      VR_compteur=0;
-      VV_compteur=0;
-      if (num_elem<i)
-        {
-          *RR_tab1++ = RR_rang;
-          *RV_tab1++ = RV_rang;
-        }
-      else
-        {
-          *VR_tab1++ = VR_rang;
-          *VV_tab1++ = VV_rang;
-        }
-      // Diagonale eventuelle
-      if (terme_diag.size_array()!=0)
-        {
-          if (num_elem<i)
-            {
-              *RR_tab2_ptr++ = num_elem;
-              *RR_coeff++ = terme_diag[num_elem];
-              RR_compteur++;
-            }
-          else
-            {
-              *VV_tab2_ptr++ = num_elem-i;
-              *VV_coeff++ = terme_diag[num_elem];
-              VV_compteur++;
-            }
-        }
-      while (liste_vois)
-        {
-          int colonne = liste_vois.valeur();
-          double coeff = liste_val.valeur();
-          if (colonne<j)
-            {
-              if (num_elem<i) // Sous Bloc RR
-                {
-                  *RR_tab2_ptr++ = colonne;
-                  *RR_coeff++ = coeff;
-                  RR_compteur++;
-                }
-              else  // Sous Bloc VR
-                {
-                  *VR_tab2_ptr++ = colonne;
-                  *VR_coeff++ = coeff;
-                  VR_compteur++;
-                }
-            }
-          else
-            {
-              if (num_elem<i) // Sous Bloc RV
-                {
-                  *RV_tab2_ptr++ = colonne-j;
-                  *RV_coeff++ = coeff;
-                  RV_compteur++;
-                }
-              else // Sous Bloc VV
-                {
-                  *VV_tab2_ptr++ = colonne-j;
-                  *VV_coeff++ = coeff;
-                  VV_compteur++;
-                }
-            }
-          ++liste_vois;
-          ++liste_val;
-        }
-      RR_rang += RR_compteur;
-      RV_rang += RV_compteur;
-      VR_rang += VR_compteur;
-      VV_rang += VV_compteur;
-    }
-  RR.get_set_tab1()(i)=RR_rang;
-  RV.get_set_tab1()(i)=RV_rang;
-  VR.get_set_tab1()(n-i)=VR_rang;
-  VV.get_set_tab1()(n-i)=VV_rang;
-  // Passage a la numerotation Fortran
+// Fill this Matrice_Bloc as a 2x2 block matrix (RR RV / VR VV) from a Stencil split at row i and column j; coefficients are set to 0.
+// Precondition: none on the stencil ordering (possibly unordered). If j==-1, (j,m) is replaced by (i,n) and block (0,0) is typed Matrice_Morse_Sym.
+void Matrice_Bloc::remplir(const Stencil& tab_stencil,
+                           const int i, const int n,
+                           int j, int m)
+{
+  dimensionner(2, 2); // 2x2 blocks
+  if (j==-1)
+    {
+      j = i;
+      m = n;
+      get_bloc(0, 0).typer("Matrice_Morse_Sym");
+    }
+  else
+    get_bloc(0,0).typer("Matrice_Morse");
+  get_bloc(0,1).typer("Matrice_Morse");
+  get_bloc(1,0).typer("Matrice_Morse");
+  get_bloc(1,1).typer("Matrice_Morse");
+  Matrice_Morse& RR = ref_cast(Matrice_Morse, get_bloc(0,0).valeur());
+  Matrice_Morse& RV = ref_cast(Matrice_Morse, get_bloc(0,1).valeur());
+  Matrice_Morse& VR = ref_cast(Matrice_Morse, get_bloc(1,0).valeur());
+  Matrice_Morse& VV = ref_cast(Matrice_Morse, get_bloc(1,1).valeur());
+  using nnz_t = decltype(tab_stencil.dimension(0));
+  nnz_t nnz = tab_stencil.dimension(0); // Reminder: nnz is int (CPU) or trustIdType (GPU)
+
+  // Sizing of the matrix: count the stencil entries per row of each block and per block
+  ArrOfInt tab_cnt_RR(i), tab_cnt_RV(i), tab_cnt_VR(n-i), tab_cnt_VV(n-i); // nnz per row
+  TRUSTArray<nnz_t, int> tab_bloc_nnz(4);
+  auto bloc_nnz = tab_bloc_nnz.view_rw();
+  const auto stencil = tab_stencil.view_ro();
+  IntArrView cnt_RR = tab_cnt_RR.view_rw();
+  IntArrView cnt_RV = tab_cnt_RV.view_rw();
+  IntArrView cnt_VR = tab_cnt_VR.view_rw();
+  IntArrView cnt_VV = tab_cnt_VV.view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nnz, KOKKOS_LAMBDA(const nnz_t k)
+  {
+    const int row = stencil(k,0);
+    const int col = stencil(k,1);
+    if (col < j)
+      {
+        if (row < i)
+          {
+            Kokkos::atomic_inc(&cnt_RR[row]);
+            Kokkos::atomic_inc(&bloc_nnz[0]); // bloc_nnz layout: [0]=RR, [1]=VR, [2]=RV, [3]=VV
+          }
+        else
+          {
+            Kokkos::atomic_inc(&cnt_VR[row-i]);
+            Kokkos::atomic_inc(&bloc_nnz[1]);
+          }
+      }
+    else
+      {
+        if (row < i)
+          {
+            Kokkos::atomic_inc(&cnt_RV[row]);
+            Kokkos::atomic_inc(&bloc_nnz[2]);
+          }
+        else
+          {
+            Kokkos::atomic_inc(&cnt_VV[row-i]);
+            Kokkos::atomic_inc(&bloc_nnz[3]);
+          }
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  RR.dimensionner(i,   j,   tab_bloc_nnz(0));
+  RV.dimensionner(i,   m-j, tab_bloc_nnz(2));
+  VR.dimensionner(n-i, j,   tab_bloc_nnz(1));
+  VV.dimensionner(n-i, m-j, tab_bloc_nnz(3));
+  // Fill tab1 (0-based row offsets at this stage) by an exclusive scan of the per-row counts; ptr_XX keeps one write cursor per row
+  TRUSTArray<nnz_t, int> tab_ptr_RR(i), tab_ptr_RV(i), tab_ptr_VR(n-i), tab_ptr_VV(n-i);
+  auto ptr_RR = tab_ptr_RR.view_rw();
+  auto ptr_RV = tab_ptr_RV.view_rw();
+  auto ptr_VR = tab_ptr_VR.view_rw();
+  auto ptr_VV = tab_ptr_VV.view_rw();
+  auto RR_tab1 = RR.get_set_tab1().view_wo();
+  Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), i+1, KOKKOS_LAMBDA(const int row, nnz_t& offset, const bool final)
+  {
+    if (final) RR_tab1[row] = offset; // the extra iteration row==i writes the total
+    if (row < i)
+      {
+        if (final) ptr_RR[row] = offset;
+        offset += cnt_RR[row];
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  auto RV_tab1 = RV.get_set_tab1().view_wo();
+  Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), i+1, KOKKOS_LAMBDA(const int row, nnz_t& offset, const bool final)
+  {
+    if (final) RV_tab1[row] = offset;
+    if (row < i)
+      {
+        if (final) ptr_RV[row] = offset;
+        offset += cnt_RV[row];
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  auto VR_tab1 = VR.get_set_tab1().view_wo();
+  Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), n-i+1, KOKKOS_LAMBDA(const int row, nnz_t& offset, const bool final)
+  {
+    if (final) VR_tab1[row] = offset;
+    if (row < n-i)
+      {
+        if (final) ptr_VR[row] = offset;
+        offset += cnt_VR[row];
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  auto VV_tab1 = VV.get_set_tab1().view_wo();
+  Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), n-i+1, KOKKOS_LAMBDA(const int row, nnz_t& offset, const bool final)
+  {
+    if (final) VV_tab1[row] = offset;
+    if (row < n-i)
+      {
+        if (final) ptr_VV[row] = offset;
+        offset += cnt_VV[row];
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+
+  // Fill tab2 (block-local column indices) and coeff; each entry claims its slot in its row through an atomic cursor increment
+  auto RR_tab2 = RR.get_set_tab2().view_wo();
+  auto RR_coeff = RR.get_set_coeff().view_wo();
+  auto RV_tab2 = RV.get_set_tab2().view_wo();
+  auto RV_coeff = RV.get_set_coeff().view_wo();
+  auto VR_tab2 = VR.get_set_tab2().view_wo();
+  auto VR_coeff = VR.get_set_coeff().view_wo();
+  auto VV_tab2 = VV.get_set_tab2().view_wo();
+  auto VV_coeff = VV.get_set_coeff().view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nnz, KOKKOS_LAMBDA(const nnz_t k)
+  {
+    const int row = stencil(k,0);
+    const int col = stencil(k,1);
+    const double coeff = 0.; // only the structure is built here: every stored coefficient is zero
+    if (col < j)
+      {
+        if (row < i)
+          {
+            auto slot = Kokkos::atomic_fetch_inc(&ptr_RR[row]);
+            RR_tab2[slot] = col;
+            RR_coeff[slot] = coeff;
+          }
+        else
+          {
+            auto slot = Kokkos::atomic_fetch_inc(&ptr_VR[row-i]);
+            VR_tab2[slot] = col;
+            VR_coeff[slot] = coeff;
+          }
+      }
+    else
+      {
+        if (row < i)
+          {
+            auto slot = Kokkos::atomic_fetch_inc(&ptr_RV[row]);
+            RV_tab2[slot] = col-j; // column index relative to the block
+            RV_coeff[slot] = coeff;
+          }
+        else
+          {
+            auto slot = Kokkos::atomic_fetch_inc(&ptr_VV[row-i]);
+            VV_tab2[slot] = col-j;
+            VV_coeff[slot] = coeff;
+          }
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  // Convert to Fortran numbering (formeF adds 1 to tab1 and tab2)
   RR.formeF();
   RV.formeF();
   VR.formeF();
   VV.formeF();
-
-  // Compactage de la matrice
   RR.compacte();
   RV.compacte();
   VR.compacte();
   VV.compacte();
+  // Sort by increasing column
+  RR.sort_stencil();
+  RV.sort_stencil();
+  VR.sort_stencil();
+  VV.sort_stencil();
 }
 
 Matrice_Bloc& Matrice_Bloc::operator *=( double x )
diff --git a/src/Kernel/Math/Matrices/Matrice_Bloc.h b/src/Kernel/Math/Matrices/Matrice_Bloc.h
index a0d0bb21ea..2b457e9c51 100644
--- a/src/Kernel/Math/Matrices/Matrice_Bloc.h
+++ b/src/Kernel/Math/Matrices/Matrice_Bloc.h
@@ -20,6 +20,7 @@
 #include <Matrice_Base.h>
 #include <Matrice.h>
 #include <TRUSTLists.h>
+#include <Matrix_tools.h>
 #include <vector>
 #include <TRUST_Vector.h>
 
@@ -93,14 +94,8 @@ public :
   int nb_bloc_lignes() const;            // retourne N_
   int nb_bloc_colonnes(void ) const;           // retourne M_
 
-  // Remplissage par une matrice morse symetrique
-  void remplir(const IntLists& voisins, const DoubleLists& valeurs, const DoubleVect& terme_diag, const int i, const int n);
-
-  // // Remplissage par une matrice morse
-  void remplir(const IntLists& voisins, const DoubleLists& valeurs, const int i, const int n, const int j, const int m);
-
-  // Remplissage par une matrice morse symetrique ou non
-  void remplir(const IntLists& voisins, const DoubleLists& valeurs, const DoubleVect& terme_diag, const int i, const int n, const int j, const int m);
+  // Fill from a Stencil (2x2 blocks split at row i / column j); j=-1 means (j,m)=(i,n) with a symmetric (0,0) block
+  void remplir(const Stencil& stencil, const int i, const int n, int j=-1, int m=-1);
 
   // // Conversion vers une Matrice_Morse
   void block_to_morse( Matrice_Morse& matrix ) const;
diff --git a/src/Kernel/Math/Matrices/Matrice_Bloc_Sym.cpp b/src/Kernel/Math/Matrices/Matrice_Bloc_Sym.cpp
index 1a3a730c79..cb0c80e0eb 100644
--- a/src/Kernel/Math/Matrices/Matrice_Bloc_Sym.cpp
+++ b/src/Kernel/Math/Matrices/Matrice_Bloc_Sym.cpp
@@ -343,8 +343,8 @@ void Matrice_Bloc_Sym::get_symmetric_stencil( Stencil& stencil ) const
               local_matrix.get_stencil( local_stencil_ );
             }
 
-          const int size = local_stencil_.dimension( 0 );
-          for ( int k=0; k<size; ++k )
+          const auto size = local_stencil_.dimension( 0 ); // may be wider than int (Stencil size type)
+          for ( auto k=decltype(size){0}; k<size; ++k ) // counter has the type of size ('auto k=0' would deduce int)
             {
               local_stencil_( k, 0 ) += imin;
               local_stencil_( k, 1 ) += jmin;
@@ -362,9 +362,9 @@ void Matrice_Bloc_Sym::get_symmetric_stencil( Stencil& stencil ) const
   for ( int i=0; i<nb_stencils; ++i )
     {
       const Stencil& local_stencil_ = local_stencils[ i ];
-      const int size = local_stencil_.dimension( 0 );
+      const auto size = local_stencil_.dimension( 0 ); // may be wider than int (Stencil size type)
 
-      for ( int k=0; k<size; ++k )
+      for ( auto k=decltype(size){0}; k<size; ++k ) // counter has the type of size ('auto k=0' would deduce int)
         {
           const int line = local_stencil_( k, 0 );
           offsets[ line + 1 ] += 1;
@@ -384,9 +384,9 @@ void Matrice_Bloc_Sym::get_symmetric_stencil( Stencil& stencil ) const
   for ( int i=0; i<nb_stencils; ++i )
     {
       const Stencil& local_stencil_ = local_stencils[ i ];
-      const int size = local_stencil_.dimension( 0 );
+      const auto size = local_stencil_.dimension( 0 ); // may be wider than int (Stencil size type)
 
-      for ( int k=0; k<size; ++k )
+      for ( auto k=decltype(size){0}; k<size; ++k ) // counter has the type of size ('auto k=0' would deduce int)
         {
           const int line   = local_stencil_( k, 0 );
           const int column = local_stencil_( k, 1 );
@@ -465,8 +465,8 @@ void Matrice_Bloc_Sym::get_symmetric_stencil_and_coefficients( Stencil&      ste
               local_matrix.get_stencil_and_coefficients( local_stencil_, coefficients_ );
             }
 
-          const int size = local_stencil_.dimension( 0 );
-          for ( int k=0; k<size; ++k )
+          const auto size = local_stencil_.dimension( 0 ); // may be wider than int (Stencil size type)
+          for ( auto k=decltype(size){0}; k<size; ++k ) // counter has the type of size ('auto k=0' would deduce int)
             {
               local_stencil_( k, 0 ) += imin;
               local_stencil_( k, 1 ) += jmin;
@@ -484,9 +484,9 @@ void Matrice_Bloc_Sym::get_symmetric_stencil_and_coefficients( Stencil&      ste
   for ( int i=0; i<nb_stencils; ++i )
     {
       const Stencil& local_stencil_ = local_stencils[ i ];
-      const int size = local_stencil_.dimension( 0 );
+      const auto size = local_stencil_.dimension( 0 ); // may be wider than int (Stencil size type)
 
-      for ( int k=0; k<size; ++k )
+      for ( auto k=decltype(size){0}; k<size; ++k ) // counter has the type of size ('auto k=0' would deduce int)
         {
           const int line = local_stencil_( k, 0 );
           offsets[ line + 1 ] += 1;
@@ -509,10 +509,10 @@ void Matrice_Bloc_Sym::get_symmetric_stencil_and_coefficients( Stencil&      ste
       const Stencil&      local_stencil_       = local_stencils[ i ];
       const StencilCoeffs&        coefficients_ = local_coefficients[ i ];
 
-      const int size = local_stencil_.dimension( 0 );
+      const auto size = local_stencil_.dimension( 0 ); // may be wider than int (Stencil size type)
       assert( coefficients_.size_array( ) == size );
 
-      for ( int k=0; k<size; ++k )
+      for ( auto k=decltype(size){0}; k<size; ++k ) // counter has the type of size ('auto k=0' would deduce int)
         {
           const int line        = local_stencil_( k, 0 );
           const int column      = local_stencil_( k, 1 );
diff --git a/src/Kernel/Math/Matrices/Matrice_Dense.cpp b/src/Kernel/Math/Matrices/Matrice_Dense.cpp
index 29a9ebe925..b11384f7d4 100644
--- a/src/Kernel/Math/Matrices/Matrice_Dense.cpp
+++ b/src/Kernel/Math/Matrices/Matrice_Dense.cpp
@@ -365,7 +365,7 @@ void Matrice_Dense::get_stencil( Stencil& stencil ) const
         }
     }
 
-  const int new_size = stencil.dimension( 0 );
+  const auto new_size = stencil.dimension( 0 ); // auto: keeps the stencil's own size type instead of narrowing to int
   stencil.resize( new_size, 2 );
 
 }
diff --git a/src/Kernel/Math/Matrices/Matrice_Morse.cpp b/src/Kernel/Math/Matrices/Matrice_Morse.cpp
index d403dbb212..252f33477b 100644
--- a/src/Kernel/Math/Matrices/Matrice_Morse.cpp
+++ b/src/Kernel/Math/Matrices/Matrice_Morse.cpp
@@ -520,214 +520,137 @@ int Matrice_Morse::ordre() const
 void Matrice_Morse::compacte(int elim_coeff_nul)
 {
   int n=nb_lignes();
-  int coeff_nuls=0;
-  int coeff_quasi_nuls=0;
-  auto tab_elim_coeff(tab2_); // Possibly BigArrOfInt
-  tab_elim_coeff = 0;
+  using nnz_t = decltype(tab2_.size_array()); // index type of the non-zero entries (follows the size type of tab2_)
+  TRUSTArray<int, nnz_t> tab_elim_coeff(tab2_.size_array()); // per-entry flag: set to 1 when the entry must be removed
+  int nb_coefficient_to_suppress=0; // Number of distinct entries to remove (NOTE(review): int counter while nnz_t may be 64-bit)
   if (elim_coeff_nul)
     {
       ArrOfDouble tab_coeff_max(n);
-      tab_coeff_max = 0.;
       // Recherche des coefficients nuls hors diagonale a supprimer de la matrice morse
+      const auto tab1 = tab1_.view_ro(); // tab1/tab2 use Fortran numbering here, hence the "- 1" below
+      const auto coeff = coeff_.view_ro();
+      DoubleArrView coeff_max = tab_coeff_max.view_rw();
+      auto elim_coeff = tab_elim_coeff.view_rw();
+      int coeff_nuls = 0;
+      Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i, int& coeff_nuls_local)
       {
-        ArrOfInt tab_cnt(1);
-        tab_cnt = 0;
-        auto tab1 = tab1_.view_ro();
-        CDoubleArrView coeff = coeff_.view_ro();
-        DoubleArrView coeff_max = tab_coeff_max.view_rw();
-        auto elim_coeff = tab_elim_coeff.view_rw();
-        IntArrView cnt = tab_cnt.view_rw();
-        Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i)
-        {
-          auto k1 = tab1(i)-1;
-          auto k2 = tab1(i+1)-1;
-          for (auto k = k1; k < k2; k++)
-            {
-              double abs_c = Kokkos::fabs(coeff(k));
-              if (abs_c > coeff_max(i)) coeff_max(i) = abs_c;
-              if (coeff(k) == 0)
-                {
-                  Kokkos::atomic_add(&cnt(0), 1);
-                  elim_coeff(k) = 1;
-                }
-            }
-        });
-        end_gpu_timer(__KERNEL_NAME__);
-        coeff_nuls = tab_cnt(0);
-      }
-
+        nnz_t k1 = tab1(i) - 1;
+        nnz_t k2 = tab1(i + 1) - 1;
+        for (nnz_t k = k1; k < k2; k++)
+          {
+            if (Kokkos::fabs(coeff(k)) > coeff_max(i)) coeff_max(i) = Kokkos::fabs(coeff(k)); // largest |coeff| of row i
+            if (coeff(k) == 0)
+              {
+                coeff_nuls_local++;
+                elim_coeff(k) = 1;
+              }
+          }
+      }, coeff_nuls);
+      end_gpu_timer(__KERNEL_NAME__);
+      nb_coefficient_to_suppress+=coeff_nuls;
       if (elim_coeff_nul==2)
         {
           // Recherche des coefficients quasi nuls hors diagonale (1.e-12 plus petit que le coefficient le plus grand de la ligne) a supprimer de la matrice morse
           const double eps = Objet_U::precision_geom;
-          ArrOfInt tab_cnt(1);
-          tab_cnt = 0;
-          auto tab1 = tab1_.view_ro();
-          CDoubleArrView coeff = coeff_.view_ro();
-          CDoubleArrView coeff_max = tab_coeff_max.view_ro();
-          IntArrView elim_coeff = tab_elim_coeff.view_rw();
-          IntArrView cnt = tab_cnt.view_rw();
-          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i)
+          int coeff_quasi_nuls = 0;
+          Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i, int& coeff_quasi_nuls_local)
           {
             double cm = coeff_max(i);
-            if (!est_egal(cm, 0., eps) && cm < 1e10)
+            if (!est_egal(cm, 0., eps) && cm < 1e10)  // The largest coefficient of the row must be strictly positive
               {
-                auto k1 = tab1(i) - 1;
-                auto k2 = tab1(i + 1) - 1;
-                for (auto k = k1; k < k2; k++)
-                  if (coeff(k) != 0 && est_egal(Kokkos::fabs(coeff(k)) / cm, 0., eps))
+                nnz_t k1 = tab1(i) - 1;
+                nnz_t k2 = tab1(i + 1) - 1;
+                for (nnz_t k = k1; k < k2; k++)
+                  if (coeff(k) != 0                   // Exact zeros were already found (and counted) above
+                      && est_egal(Kokkos::fabs(coeff(k)) / cm, 0., eps))
                     {
-                      Kokkos::atomic_add(&cnt(0), 1);
+                      coeff_quasi_nuls_local++;
                       elim_coeff(k) = 1;
                     }
               }
-          });
+          }, coeff_quasi_nuls);
           end_gpu_timer(__KERNEL_NAME__);
-          coeff_quasi_nuls = tab_cnt(0);
+          nb_coefficient_to_suppress+=coeff_quasi_nuls;
         }
     }
   // Recherche des coefficients doublons
-  int nb_doublons=0;
+  int doublons = 0;
+  const auto tab1 = tab1_.view_ro();
+  const auto tab2 = tab2_.view_ro();
+  const auto coeff = coeff_.view_ro();
+  auto elim_coeff = tab_elim_coeff.view_rw();
+  Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA(const int i, int& doublons_local)
   {
-    auto tab1 = tab1_.view_ro();
-    CIntArrView tab2 = tab2_.view_ro();
-    CDoubleArrView coeff = coeff_.view_ro();
-    IntArrView elim_coeff = tab_elim_coeff.view_rw();
-    ArrOfInt tab_doublons(1);
-    tab_doublons = 0;
-    ArrOfInt tab_error(1);
-    tab_error = 0;
-    IntArrView doublons = tab_doublons.view_rw();
-    IntArrView error = tab_error.view_rw();
-    Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i)
-    {
-      auto k1 = tab1(i)-1;
-      auto k2 = tab1(i+1)-1;
-      int jmax = -1; // Highest column of a coefficient in the line i
-      for (auto k = k1; k < k2; k++)
-        {
-          int j = tab2(k)-1;
-          if (j > jmax)
-            jmax = j;
-          else
-            {
-              // Found a column j lower than jmax, check if not defined before:
-              for (auto kk = k-1; kk >= k1; kk--)
-                {
-                  int jj = tab2(kk)-1;
-                  if (jj == j)
-                    {
-                      // Already defined!
-                      Kokkos::atomic_add(&doublons(0), 1);
-                      elim_coeff(k) = 1;
-                      // Check if same coefficients:
-                      if (coeff(kk) != coeff(k))
-                        Kokkos::atomic_add(&error(0), 1);
-                      break;
-                    }
-                }
-            }
-        }
-    });
-    end_gpu_timer(__KERNEL_NAME__);
-    nb_doublons = tab_doublons(0);
-    if (tab_error(0))
-      {
-        Cerr << "Error in a Matrix Morse: duplicate entries with different values!" << finl;
-        exit();
-      }
-  }
-
-  auto nnz(tab1_(0));
-  nnz=0;
-  if (nb_doublons || coeff_nuls || coeff_quasi_nuls)
-    {
-      // Step 1: Count kept entries per row (parallel_for over rows)
-      ArrOfInt tab_kept_per_row(n);
+    nnz_t k1 = tab1(i) - 1;
+    nnz_t k2 = tab1(i + 1) - 1;
+    int jmax = -1; // Highest column of a coefficient in the line i
+    for (nnz_t k = k1; k < k2; k++)
       {
-        auto tab1 = tab1_.view_ro();
-        CIntArrView elim_coeff = tab_elim_coeff.view_ro();
-        IntArrView kept_per_row = tab_kept_per_row.view_wo();
-        Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i)
-        {
-          int count = 0;
-          auto k1 = tab1(i)-1;
-          auto k2 = tab1(i+1)-1;
-          for (auto k = k1; k < k2; k++)
-            if (!elim_coeff(k)) count++;
-          kept_per_row(i) = count;
-        });
-        end_gpu_timer(__KERNEL_NAME__);
-      }
-
-      // Step 2: Save old tab1_ (needed for source offsets in scatter step)
-      auto old_tab1(tab1_);
-
-      // Step 3: Update tab1_ via prefix scan (updates tab1_(1..n), tab1_(0)=1 unchanged)
-      using tab1_scan_t = decltype(nnz);
-      {
-        auto tab1 = tab1_.view_rw();
-        CIntArrView kept_per_row = tab_kept_per_row.view_ro();
-        Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i, tab1_scan_t& update, const bool final)
-        {
-          update += kept_per_row(i);
-          if (final) tab1(i+1) = update + 1;
-        });
-        end_gpu_timer(__KERNEL_NAME__);
-      }
-
-      // Step 4: Out-of-place scatter of coeff_ and tab2_ to new positions (parallel_for over rows)
-      // Safe because new_pos(i) <= old_pos(i) always, and rows are processed independently
-      nnz = tab1_[n] - 1;
-      auto new_coeff(coeff_);
-      auto new_tab2(tab2_);
-      {
-        auto tab1 = tab1_.view_ro();
-        auto old_tab1_ro = old_tab1.view_ro();
-        CDoubleArrView coeff_src = coeff_.view_ro();
-        CIntArrView tab2_src = tab2_.view_ro();
-        DoubleArrView coeff_dst = new_coeff.view_wo();
-        IntArrView tab2_dst = new_tab2.view_wo();
-        CIntArrView elim_coeff = tab_elim_coeff.view_ro();
-        Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i)
-        {
-          auto new_pos = tab1(i) - 1;
-          auto k1 = old_tab1_ro(i)-1;
-          auto k2 = old_tab1_ro(i+1)-1;
-          for (auto k = k1; k < k2; k++)
-            if (!elim_coeff(k))
+        int j = tab2(k) - 1;
+        if (j > jmax)
+          jmax = j;
+        else
+          {
+            // Found a column j lower than jmax, check if not defined before:
+            for (nnz_t kk = k-1; kk >= k1; kk--)
               {
-                coeff_dst(new_pos) = coeff_src(k);
-                tab2_dst(new_pos) = tab2_src(k);
-                new_pos++;
+                int jj = tab2(kk) - 1;
+                if (jj == j)
+                  {
+                    // Already defined!
+                    if (!elim_coeff(k)) doublons_local++; // count once: k may already be flagged (and counted) as a (quasi-)null entry
+                    elim_coeff(k) = 1;
+                    // Check if same coefficients:
+                    if (coeff(kk) != coeff(k)) Process::Kokkos_exit("Error in Matrice_Morse::compacte !");
+                    break;
+                  }
               }
-        });
-        end_gpu_timer(__KERNEL_NAME__);
-      }
-
-      // Step 5: Copy compacted data back
-      {
-        auto tab2 = tab2_.view_rw();
-        auto coeff = coeff_.view_rw();
-        CIntArrView new_tab2_ro = new_tab2.view_ro();
-        CDoubleArrView new_coeff_ro = new_coeff.view_ro();
-        Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nnz), KOKKOS_LAMBDA(const int i)
-        {
-          tab2(i) = new_tab2_ro(i);
-          coeff(i) = new_coeff_ro(i);
-        });
-        end_gpu_timer(__KERNEL_NAME__);
+          }
       }
-    }
-  else
+  }, doublons);
+  end_gpu_timer(__KERNEL_NAME__);
+  nb_coefficient_to_suppress+=doublons;
+  if (nb_coefficient_to_suppress)
     {
-      nnz = tab1_[n] - 1;
+      Cerr << nb_coefficient_to_suppress << " null or duplicated coefficients removed from a CSR matrix." << finl;
+      // Copy of the matrix arrays (sources of the compaction below):
+      ToDo_Kokkos("avoid this 3 copy...");
+      auto tab_old_tab1(tab1_);
+      auto tab_old_tab2(tab2_);
+      auto tab_old_coeff(coeff_);
+      // Resize the current arrays: the count must equal the number of flagged entries exactly
+      tab2_.resize(tab_old_coeff.size() - nb_coefficient_to_suppress);
+      coeff_.resize(tab_old_coeff.size() - nb_coefficient_to_suppress);
+      // Copy of the coefficients to keep:
+      const auto old_tab1  = tab_old_tab1.view_ro();
+      const auto old_tab2  = tab_old_tab2.view_ro();
+      const auto old_coeff = tab_old_coeff.view_ro();
+      auto new_tab1 = tab1_.view_wo();
+      auto new_tab2 = tab2_.view_wo();
+      auto new_coeff = coeff_.view_wo();
+      Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA(const int i, nnz_t& offset, const bool final)
+      {
+        nnz_t k1 = old_tab1(i) - 1;
+        nnz_t k2 = old_tab1(i+1) - 1;
+        nnz_t count = 0; // kept entries of row i
+        for (nnz_t k = k1; k < k2; k++)
+          if (!elim_coeff(k)) count++;
+        if (final)
+          {
+            new_tab1(i+1) = offset + count + 1; // Fortran numbering; new_tab1(0) is not rewritten
+            nnz_t nnz = offset;
+            for (nnz_t k = k1; k < k2; k++)
+              if (!elim_coeff(k))
+                {
+                  new_coeff(nnz) = old_coeff(k);
+                  new_tab2(nnz) = old_tab2(k);
+                  nnz++;
+                }
+          }
+        offset += count;
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
-
-  // On redimensionne les tableaux
-  tab2_.resize(nnz);
-  coeff_.resize(nnz);
-
   morse_matrix_structure_has_changed_=1, sorted_ = 0;
   assert_check_morse_matrix_structure( );
 }
@@ -815,9 +738,10 @@ Matrice_Morse& Matrice_Morse::diagmulmat(const DoubleVect& x)
       Cerr << "Matrice_Morse::diagmulmat bad dimensions" << finl;
       exit();
     }
+  set_tab1_int32(); // presumably refreshes the int32 copy of tab1_ returned by get_tab1_int32() - confirm
   F77NAME(DIAMUA)(&m ,&l,
-                  coeff_.addr(),tab2_.addr(),reinterpret_cast<const int*>(tab1_.addr()),x.addr(),
-                  coeff_.addr(),tab2_.addr(),reinterpret_cast<int*>(tab1_.addr()));
+                  coeff_.addr(),tab2_.addr(),get_tab1_int32().addr(),x.addr(),
+                  coeff_.addr(),tab2_.addr(),const_cast<int*>(get_tab1_int32().addr())); // NOTE(review): the output row pointer is the int32 copy, not tab1_ - assumes DIAMUA leaves it unchanged; confirm
   return(*this);
 }
 
@@ -1262,6 +1186,7 @@ int Matrice_Morse::inverse(const DoubleVect& secmem, DoubleVect& solution,
   int minits = 10;
   int maxits = std::max(minits, retry_on_failure ? nn : max_iter);
   int io = 0;
+  set_tab1_int32(); // presumably refreshes the int32 copy of tab1_ passed to PGMRES through get_tab1_int32() - confirm
   F77NAME(PGMRES)(&nn, &ima, toto.addr(), solution.addr(), vv.addr(), &coeff_seuilr,
                   &maxits, &io, coeff_.addr(), tab2_.addr(), get_tab1_int32().addr(),
                   alu.addr(), jlu.addr(), ju.addr(), &ie);
@@ -1748,10 +1673,13 @@ void Matrice_Morse::remplir(const int ideb, const int jdeb, const int n, const i
 void Matrice_Morse::formeC()
 {
   int n=nb_lignes();
+  ToDo_Kokkos("critical"); // NOTE(review): presumably marks the host loop below as still to be ported to Kokkos - confirm
   for(int ii=0; ii<=n; ii++)
     tab1_(ii)--;
+  ToDo_Kokkos("critical"); // same marker for the loop over rows below
   for(int ii=0; ii<n; ii++)
     tab2_(tab1_(ii))=nb_vois(ii);
+  ToDo_Kokkos("critical"); // same marker for the loop over coefficients below
   for(auto k=0; k<nb_coeff(); k++)
     tab2_(k)--;
   morse_matrix_structure_has_changed_=1;
@@ -1761,10 +1689,19 @@ void Matrice_Morse::formeC()
 void Matrice_Morse::formeF()
 {
   int n=nb_lignes();
-  for(int ii=0; ii<=n; ii++)
-    tab1_(ii)++;
-  for(auto k=0; k<nb_coeff(); k++)
-    tab2_(k)++;
+  auto tab1 = tab1_.view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), n+1, KOKKOS_LAMBDA(const int ii)
+  {
+    tab1(ii)++; // shift the n+1 row offsets by one
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  auto tab2 = tab2_.view_rw();
+  using nnz_t = decltype(nb_coeff()); // loop index has the type returned by nb_coeff()
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_coeff(), KOKKOS_LAMBDA(const nnz_t k)
+  {
+    tab2(k)++; // shift every column index by one
+  });
+  end_gpu_timer(__KERNEL_NAME__);
   morse_matrix_structure_has_changed_=1;
   is_stencil_up_to_date_=false;
 }
@@ -2060,6 +1997,103 @@ bool Matrice_Morse::is_diagonal()
   return is_diagonal;
 }
 
+/*! @brief Build the full (non-symmetric) Morse matrix from an upper-triangular symmetric one.
+ *
+ *  Matrice_Morse_Sym stores only the upper triangle with all diagonal entries guaranteed present.
+ *  This method expands it to a full CSR matrix: each off-diagonal entry (i,j) generates both
+ *  (i,j) and the mirror (j,i).  nnz_full = 2*nnz_sym - n.
+ *  Entries within each row are sorted by column index (sorted_ = 1).
+ *
+ *  Two-pass direct construction: no intermediate Matrice_Morse objects are allocated.
+ *  Peak extra memory is O(n) integers (write-cursor array) versus the old 4*nnz_sym peak.
+ */
+Matrice_Morse& Matrice_Morse::convert(const Matrice_Morse_Sym& MS)
+{
+  const int n = MS.nb_lignes();
+  auto MS_tab1  = MS.get_tab1().view_ro();
+  auto MS_tab2  = MS.get_tab2().view_ro();
+  auto MS_coeff = MS.get_coeff().view_ro();
+
+  // Step 1: count the entries of each row of the full matrix (an off-diagonal (i,j) also adds one entry to row j)
+  ArrOfInt tab_count(n);
+  IntArrView count = tab_count.view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i)
+  {
+    for (auto k = MS_tab1(i) - 1; k < MS_tab1(i + 1) - 1; k++)
+      {
+        const int j = MS_tab2(k) - 1; // MS uses 1-based indices
+        Kokkos::atomic_add(&count(i), 1);
+        if (j != i) Kokkos::atomic_add(&count(j), 1); // atomic: row j is also updated by other rows
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+
+  // Step 2: Allocation, then row offsets by a prefix scan of the per-row counts
+  const auto nnz = 2 * MS.nb_coeff() - n; // relies on every diagonal entry being stored in MS
+  dimensionner(n, n, nnz);
+  tab1_[0] = 1; // 1-based offset of the first row
+
+  auto tab1 = tab1_.view_rw();
+  using tab1_scan_t = typename decltype(tab1)::value_type;
+  Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i, tab1_scan_t& update, const bool final)
+  {
+    update += count(i);
+    if (final) tab1(i + 1) = update + 1;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+
+  // Step 3: Add the entries; tab_pos starts as a copy of the row offsets and serves as one write cursor per row
+  auto tab_pos(tab1_);
+  tab_pos.resize(n);
+  auto pos   = tab_pos.view_rw();
+  auto tab2  = tab2_.view_rw();
+  auto coeff = coeff_.view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i)
+  {
+    for (auto k = MS_tab1(i) - 1; k < MS_tab1(i + 1) - 1; k++)
+      {
+        const int j      = MS_tab2(k) - 1;
+        const double val = MS_coeff(k);
+        const auto slot_i = Kokkos::atomic_fetch_add(&pos(i), 1) - 1; // cursors are 1-based, slots 0-based
+        tab2(slot_i)  = j + 1;
+        coeff(slot_i) = val;
+        if (j != i)
+          {
+            const auto slot_j = Kokkos::atomic_fetch_add(&pos(j), 1) - 1; // mirror entry (j,i)
+            tab2(slot_j)  = i + 1;
+            coeff(slot_j) = val;
+          }
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+
+  // Step 4: sorting coefficients per row (insertion sort on the column index, one row per iteration)
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i)
+  {
+    const auto k1 = tab1(i) - 1;
+    const auto k2 = tab1(i + 1) - 1;
+    for (auto k = k1 + 1; k < k2; k++)
+      {
+        const int    col_k = tab2(k);
+        const double val_k = coeff(k);
+        auto m = k;
+        while (m > k1 && tab2(m - 1) > col_k)
+          {
+            tab2(m)  = tab2(m - 1); // shift larger columns one slot to the right
+            coeff(m) = coeff(m - 1);
+            --m;
+          }
+        tab2(m)  = col_k;
+        coeff(m) = val_k;
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+
+  morse_matrix_structure_has_changed_ = 1;
+  sorted_ = 1;
+  return *this;
+}
+
 // Explicit instantiations for 'auto nnz' abbreviated function templates
 template Matrice_Morse::Matrice_Morse(int, int);
 template Matrice_Morse::Matrice_Morse(int, int, int);
diff --git a/src/Kernel/Math/Matrices/Matrice_Morse.h b/src/Kernel/Math/Matrices/Matrice_Morse.h
index aa0caea9dd..590e90a7da 100644
--- a/src/Kernel/Math/Matrices/Matrice_Morse.h
+++ b/src/Kernel/Math/Matrices/Matrice_Morse.h
@@ -46,6 +46,7 @@
  *
  * @sa Matrice_Base Matrice_Morse_Sym
  */
+class Matrice_Morse_Sym;
 class Matrice_Morse : public Matrice_Base
 {
 
@@ -164,6 +165,9 @@ public :
   // A= creat_transposee(B)
   virtual Matrice_Morse& transpose(const Matrice_Morse& a);
 
+  // Build full (non-symmetric) Morse matrix from a symmetric one (upper triangle storage)
+  Matrice_Morse& convert(const Matrice_Morse_Sym& MS);
+
   // A=x*A (x vecteur diag)
   virtual Matrice_Morse& diagmulmat(const DoubleVect& x);
 
diff --git a/src/Kernel/Math/Matrices/Matrice_Morse_Sym.cpp b/src/Kernel/Math/Matrices/Matrice_Morse_Sym.cpp
index bffbbb31a1..66702d300d 100644
--- a/src/Kernel/Math/Matrices/Matrice_Morse_Sym.cpp
+++ b/src/Kernel/Math/Matrices/Matrice_Morse_Sym.cpp
@@ -561,7 +561,7 @@ void Matrice_Morse_Sym::get_symmetric_stencil( Stencil& stencil ) const
         }
     }
 
-  const int new_size = stencil.dimension( 0 );
+  const auto new_size = stencil.dimension( 0 ); // auto: keeps the stencil's own size type instead of narrowing to int
 
   stencil.resize( new_size, 2 );
 }
@@ -633,7 +633,7 @@ void Matrice_Morse_Sym::get_symmetric_stencil_and_coefficients( Stencil&      st
         }
     }
 
-  const int new_size = stencil.dimension( 0 );
+  const auto new_size = stencil.dimension( 0 ); // auto: keeps the stencil's own size type instead of narrowing to int
   assert( coefficients.size_array( ) == new_size );
 
 
diff --git a/src/Kernel/Math/Matrices/Matrice_Sym.cpp b/src/Kernel/Math/Matrices/Matrice_Sym.cpp
index c385a1d1c2..24cd5a07df 100644
--- a/src/Kernel/Math/Matrices/Matrice_Sym.cpp
+++ b/src/Kernel/Math/Matrices/Matrice_Sym.cpp
@@ -35,8 +35,8 @@ void Matrice_Sym::unsymmetrize_stencil( const int nb_lines,
   ArrOfInt offsets( nb_lines + 1 );
   offsets[ 0 ] = 0;
 
-  const int symmetric_stencil_size = symmetric_stencil.dimension( 0 );
-  for ( int k=0; k<symmetric_stencil_size; ++k )
+  const auto symmetric_stencil_size = symmetric_stencil.dimension( 0 ); // may be wider than int (Stencil size type)
+  for ( auto k=decltype(symmetric_stencil_size){0}; k<symmetric_stencil_size; ++k ) // same type as the bound ('auto k=0' would deduce int)
     {
       const int line   = symmetric_stencil( k, 0 );
       const int column = symmetric_stencil( k, 1 );
@@ -55,7 +55,7 @@ void Matrice_Sym::unsymmetrize_stencil( const int nb_lines,
   stencil = -1;
 
   int index;
-  for ( int k=0; k<symmetric_stencil_size; ++k )
+  for ( decltype(symmetric_stencil.dimension( 0 )) k=0; k<symmetric_stencil_size; ++k ) // counter typed like the size (`auto k=0` would be int)
     {
       const int line   = symmetric_stencil( k, 0 );
       const int column = symmetric_stencil( k, 1 );
@@ -97,9 +97,9 @@ void Matrice_Sym::unsymmetrize_stencil_and_coefficients( const int          nb_l
   ArrOfInt offsets( nb_lines + 1 );
   offsets[ 0 ] = 0;
 
-  const int symmetric_stencil_size = symmetric_stencil.dimension( 0 );
+  const auto symmetric_stencil_size = symmetric_stencil.dimension( 0 );
   assert( symmetric_stencil_size == symmetric_coefficients.size_array( ) );
-  for ( int k=0; k<symmetric_stencil_size; ++k )
+  for ( decltype(symmetric_stencil.dimension( 0 )) k=0; k<symmetric_stencil_size; ++k ) // counter typed like the size (`auto k=0` would be int)
     {
       const int line   = symmetric_stencil( k, 0 );
       const int column = symmetric_stencil( k, 1 );
@@ -119,7 +119,7 @@ void Matrice_Sym::unsymmetrize_stencil_and_coefficients( const int          nb_l
   stencil = -1;
 
   int index;
-  for ( int k=0; k<symmetric_stencil_size; ++k )
+  for ( decltype(symmetric_stencil.dimension( 0 )) k=0; k<symmetric_stencil_size; ++k ) // counter typed like the size (`auto k=0` would be int)
     {
       const int line        = symmetric_stencil( k, 0 );
       const int column      = symmetric_stencil( k, 1 );
diff --git a/src/Kernel/Math/Matrices/Matrix_tools.cpp b/src/Kernel/Math/Matrices/Matrix_tools.cpp
index 261ceadd04..b0f70dd5f2 100644
--- a/src/Kernel/Math/Matrices/Matrix_tools.cpp
+++ b/src/Kernel/Math/Matrices/Matrix_tools.cpp
@@ -108,8 +108,8 @@ bool Matrix_tools::is_normalized_symmetric_stencil( const Stencil& stencil )
       return false;
     }
 
-  const int size = stencil.dimension( 0 );
-  for ( int i=0; i<size; ++i )
+  const auto size = stencil.dimension( 0 );
+  for ( decltype(stencil.dimension( 0 )) i=0; i<size; ++i ) // counter typed like the size (`auto i=0` would be int)
     {
       int delta = stencil( i, 0 ) - stencil( i, 1 );
       if ( delta > 0 )
@@ -167,66 +167,63 @@ void Matrix_tools::allocate_morse_matrix( const int nb_lines,
                                           Matrice_Morse& matrix ,
                                           const bool& attach_stencil_to_matrix )
 {
-  assert( is_normalized_stencil( stencil ) );
-
-  const int nb_coefficients = stencil.dimension( 0 );
-  matrix.dimensionner( nb_lines, nb_columns, nb_coefficients );
-  {
-    auto& tab1 = matrix.get_set_tab1();
-    auto& tab2 = matrix.get_set_tab2();
-    if ( nb_coefficients > 0 )
-      {
-        tab1 = 0;
-        tab1[0] = 1;
-        for ( int i=0; i<nb_coefficients; ++i )
-          {
-            tab1[stencil(i, 0) + 1] += 1;
-            tab2[i] = stencil(i, 1) + 1;
-          }
-        for ( int i=0; i<nb_lines; ++i )
-          tab1[i + 1] += tab1[i];
-      }
-  }
-  if( attach_stencil_to_matrix )
-    matrix.set_stencil( stencil );
+  assert(is_normalized_stencil(stencil));
+  StencilCoeffs coefficients; // Allocate <=> Build with no coefficients
+  build_morse_matrix(nb_lines, nb_columns, stencil, coefficients, matrix, attach_stencil_to_matrix);
 }
 
 void Matrix_tools::build_morse_matrix( const int     nb_lines,
                                        const int     nb_columns,
-                                       const Stencil&      stencil,
-                                       const StencilCoeffs& coefficients,
-                                       Matrice_Morse&     matrix )
+                                       const Stencil&      tab_stencil,
+                                       const StencilCoeffs& tab_coefficients,
+                                       Matrice_Morse&     matrix,
+                                       const bool& attach_stencil_to_matrix)
 {
   // No : stencil do not rely on sorted columns
   //assert( is_normalized_stencil( stencil ) );
 
-  const int nb_coefficients = stencil.dimension( 0 );
-  assert( nb_coefficients == coefficients.size_array( ) );
+  using nnz_t = decltype(tab_stencil.dimension(0));
+  const nnz_t nnz = tab_stencil.dimension( 0 );
+  bool has_coefficients = tab_coefficients.size_array() != 0;
+  assert(!has_coefficients || nnz == tab_coefficients.size_array());
 
   matrix.dimensionner( nb_lines,
                        nb_columns,
-                       nb_coefficients );
+                       nnz );
 
-  if ( nb_coefficients > 0 )
+  if (nnz > 0)
     {
-      matrix.get_set_tab1() =0  ;
-      matrix.get_set_tab1()( 0 ) = 1;
-      for ( int i=0; i<nb_coefficients; ++i )
-        {
-          assert( stencil( i ,0 ) >= 0         );
-          assert( stencil( i ,0 ) < nb_lines   );
-          assert( stencil( i ,1 ) >= 0         );
-          assert( stencil( i ,1 ) < nb_columns );
-
-          matrix.get_set_tab1()( stencil( i, 0 ) + 1 ) += 1;
-          matrix.get_set_tab2()( i ) = stencil( i, 1 ) + 1;
-          matrix.get_set_coeff()( i ) = coefficients[ i ];
-        }
-      for ( int i=0; i<nb_lines; ++i )
-        {
-          matrix.get_set_tab1()( i + 1 ) += matrix.get_tab1()( i );
-        }
+      auto stencil      = tab_stencil.view_ro();
+      auto coefficients = tab_coefficients.view_ro();
+      auto tab1  = matrix.get_set_tab1().view_rw();
+      // Init tab1
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_lines + 1, KOKKOS_LAMBDA(const int i)
+      {
+        tab1(i) = i == 0 ? 1 : 0;
+      });
+      end_gpu_timer(__KERNEL_NAME__);
+      auto tab2  = matrix.get_set_tab2().view_wo();
+      auto coeff = matrix.get_set_coeff().view_wo();
+      // Fill tab2 and coeff (both write-only views, so every entry is written) and count nnz per row:
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nnz, KOKKOS_LAMBDA(const nnz_t i)
+      {
+        const int row = stencil(i, 0);
+        const int col = stencil(i, 1);
+        Kokkos::atomic_add(&tab1(row + 1), 1);
+        tab2(i) = col + 1;
+        coeff(i) = has_coefficients ? coefficients(i) : 0.; // allocation only (no coefficients given): store 0
+      });
+      end_gpu_timer(__KERNEL_NAME__);
+      // Fill tab1:
+      Kokkos::parallel_scan(start_gpu_timer(__KERNEL_NAME__), nb_lines, KOKKOS_LAMBDA(const int i, nnz_t& offset, const bool final)
+      {
+        offset += tab1(i + 1);
+        if (final) tab1(i + 1) = offset + 1;
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
+  if( attach_stencil_to_matrix )
+    matrix.set_stencil( tab_stencil );
 }
 
 
@@ -236,35 +233,9 @@ void Matrix_tools::allocate_symmetric_morse_matrix( const int     order,
                                                     const Stencil&      stencil,
                                                     Matrice_Morse_Sym& matrix )
 {
-  assert( is_normalized_symmetric_stencil( stencil ) );
-
-  const int nb_coefficients = stencil.dimension( 0 );
-
-  matrix.dimensionner( order,
-                       order,
-                       nb_coefficients );
-
-  if ( nb_coefficients > 0 )
-    {
-      matrix.get_set_tab1()= 0 ;
-      matrix.get_set_tab1()( 0 ) = 1;
-      for ( int i=0; i<nb_coefficients; ++i )
-        {
-          assert( stencil( i ,0 ) >= 0               );
-          assert( stencil( i ,0 ) < order            );
-          assert( stencil( i ,1 ) >= 0               );
-          assert( stencil( i ,1 ) < order            );
-          assert( stencil( i, 0 ) <= stencil( i, 1 ) );
-
-          matrix.get_set_tab1()( stencil( i, 0 ) + 1 ) += 1;
-          matrix.get_set_tab2()( i ) = stencil( i, 1 ) + 1;
-        }
-      for ( int i=0; i<order; ++i )
-        {
-          matrix.get_set_tab1()( i + 1 ) += matrix.get_tab1()( i );
-        }
-    }
-  matrix.set_symmetric( 1 );
+  assert(is_normalized_symmetric_stencil( stencil));
+  allocate_morse_matrix(order, order, stencil, matrix);
+  matrix.set_symmetric(1);
 }
 
 
@@ -273,38 +244,8 @@ void Matrix_tools::build_symmetric_morse_matrix( const int     order,
                                                  const StencilCoeffs& coefficients,
                                                  Matrice_Morse_Sym& matrix )
 {
-  // No : stencil do not rely on sorted columns
-  // assert( is_normalized_symmetric_stencil( stencil ) );
-
-  const int nb_coefficients = stencil.dimension( 0 );
-  assert( nb_coefficients == coefficients.size_array( ) );
-
-  matrix.dimensionner( order,
-                       order,
-                       nb_coefficients );
-
-  if ( nb_coefficients > 0 )
-    {
-      matrix.get_set_tab1() = 0 ;
-      matrix.get_set_tab1()( 0 ) = 1;
-      for ( int i=0; i<nb_coefficients; ++i )
-        {
-          assert( stencil( i ,0 ) >= 0               );
-          assert( stencil( i ,0 ) < order            );
-          assert( stencil( i ,1 ) >= 0               );
-          assert( stencil( i ,1 ) < order            );
-          assert( stencil( i, 0 ) <= stencil( i, 1 ) );
-
-          matrix.get_set_tab1()( stencil( i, 0 ) + 1 ) += 1;
-          matrix.get_set_tab2()( i )  = stencil( i, 1 ) + 1;
-          matrix.get_set_coeff()( i ) = coefficients[ i ];
-        }
-      for ( int i=0; i<order; ++i )
-        {
-          matrix.get_set_tab1()( i + 1 ) += matrix.get_tab1()( i );
-        }
-    }
-  matrix.set_symmetric( 1 );
+  build_morse_matrix(order, order, stencil, coefficients, matrix);
+  matrix.set_symmetric(1);
 }
 
 
@@ -318,24 +259,24 @@ void Matrix_tools::allocate_for_scaled_addition( const Matrice& A,
 
   Stencil A_stencil;
   A.valeur( ).get_stencil( A_stencil );
-  const int A_size = (int)A_stencil.dimension( 0 );
+  const auto A_size = A_stencil.dimension( 0 );
 
   Stencil B_stencil;
   B.valeur( ).get_stencil( B_stencil );
-  const int B_size = (int)B_stencil.dimension( 0 );
+  const auto B_size = B_stencil.dimension( 0 );
 
-  int size = A_size + B_size;
+  auto size = A_size + B_size;
   Stencil stencil;
 
   stencil.resize( size, 2 );
 
-  for ( int i=0; i<A_size; ++i )
+  for ( decltype(A_stencil.dimension( 0 )) i=0; i<A_size; ++i ) // counter typed like the size (`auto i=0` would be int)
     {
       stencil( i , 0 ) = A_stencil( i, 0 );
       stencil( i , 1 ) = A_stencil( i, 1 ) ;
     }
 
-  for ( int i=0; i<B_size; ++i )
+  for ( decltype(B_stencil.dimension( 0 )) i=0; i<B_size; ++i ) // counter typed like the size (`auto i=0` would be int)
     {
       stencil( i+A_size , 0 ) = B_stencil( i, 0 );
       stencil( i+A_size , 1 ) = B_stencil( i, 1 ) ;
@@ -362,24 +303,24 @@ void Matrix_tools::allocate_for_symmetric_scaled_addition( const Matrice& A,
 
   Stencil A_stencil;
   A.valeur( ).get_symmetric_stencil( A_stencil );
-  const int A_size = (int)A_stencil.dimension( 0 );
+  const auto A_size = A_stencil.dimension( 0 );
 
   Stencil B_stencil;
   B.valeur( ).get_symmetric_stencil( B_stencil );
-  const int B_size = (int)B_stencil.dimension( 0 );
+  const auto B_size = B_stencil.dimension( 0 );
 
-  int size = A_size + B_size;
+  auto size = A_size + B_size;
   Stencil stencil;
 
   stencil.resize( size, 2 );
 
-  for ( int i=0; i<A_size; ++i )
+  for ( decltype(A_stencil.dimension( 0 )) i=0; i<A_size; ++i ) // counter typed like the size (`auto i=0` would be int)
     {
       stencil( i , 0 ) = A_stencil( i, 0 );
       stencil( i , 1 ) = A_stencil( i, 1 ) ;
     }
 
-  for ( int i=0; i<B_size; ++i )
+  for ( decltype(B_stencil.dimension( 0 )) i=0; i<B_size; ++i ) // counter typed like the size (`auto i=0` would be int)
     {
       stencil( i+A_size , 0 ) = B_stencil( i, 0 );
       stencil( i+A_size , 1 ) = B_stencil( i, 1 ) ;
@@ -520,14 +461,14 @@ bool Matrix_tools::is_diagonal_stencil( const int nb_lines,
       return false;
     }
 
-  const int size = stencil.dimension( 0 );
+  const auto size = stencil.dimension( 0 );
 
   if ( size == 0 )
     {
       return false;
     }
 
-  for ( int i=0; i<size; ++i )
+  for ( decltype(stencil.dimension( 0 )) i=0; i<size; ++i ) // counter typed like the size (`auto i=0` would be int)
     {
       if ( stencil( i, 0 ) != stencil( i, 1 ) )
         {
@@ -597,13 +538,13 @@ void Matrix_tools::extend_matrix_stencil( const Stencil& stencil,
       Stencil full_stencil;
       matrix.valeur( ).get_stencil( full_stencil );
 
-      const int size = stencil.dimension( 0 );
+      const auto size = stencil.dimension( 0 );
 
-      const int old_size = full_stencil.size( ) / 2 ;
+      const auto old_size = full_stencil.size( ) / 2 ;
 
       full_stencil.resize( old_size + size, 2);
 
-      for ( int i=0; i<size; ++i )
+      for ( decltype(stencil.dimension( 0 )) i=0; i<size; ++i ) // counter typed like the size (`auto i=0` would be int)
         {
           full_stencil( old_size + i , 0 ) = stencil( i, 0 );
           full_stencil( old_size + i , 1 ) = stencil( i, 1 );
diff --git a/src/Kernel/Math/Matrices/Matrix_tools.h b/src/Kernel/Math/Matrices/Matrix_tools.h
index 9b86977cd3..cf8d73e219 100644
--- a/src/Kernel/Math/Matrices/Matrix_tools.h
+++ b/src/Kernel/Math/Matrices/Matrix_tools.h
@@ -62,7 +62,7 @@ public :
   // so we need to specify is the stencil is to attach or not to the matrix
   static void allocate_morse_matrix(const int nb_lines, const int nb_columns, const Stencil& stencil, Matrice_Morse& matrix, const bool& attach_stencil_to_matrix = false);
 
-  static void build_morse_matrix(const int nb_lines, const int nb_columns, const Stencil& stencil, const StencilCoeffs& coefficients, Matrice_Morse& matrix);
+  static void build_morse_matrix(const int nb_lines, const int nb_columns, const Stencil& stencil, const StencilCoeffs& coefficients, Matrice_Morse& matrix,  const bool& attach_stencil_to_matrix = false);
 
 // building symmetric morse matrices
   static void allocate_symmetric_morse_matrix(const int order, const Stencil& stencil, Matrice_Morse_Sym& matrix);
diff --git a/src/Kernel/Math/SolvSys/Solv_AMG.cpp b/src/Kernel/Math/SolvSys/Solv_AMG.cpp
index 1de2213bd1..870cd92cfe 100644
--- a/src/Kernel/Math/SolvSys/Solv_AMG.cpp
+++ b/src/Kernel/Math/SolvSys/Solv_AMG.cpp
@@ -14,14 +14,12 @@
 *****************************************************************************/
 
 #include <Solv_AMG.h>
-#include <EChaine.h>
 #include <Motcle.h>
 #include <Solv_AMGX.h>
 #include <Solv_Petsc_GPU.h>
 #ifdef TRUST_USE_ROCM
 #include <rocm-core/rocm_version.h>
 #endif
-#include <comm_incl.h> // Mandatory to have MPIX_CUDA_AWARE_SUPPORT defined or not
 #include <MD_Vector_composite.h>
 
 Implemente_instanciable(Solv_AMG,"Solv_AMG",SolveurSys_base);
@@ -57,7 +55,13 @@ Entree& Solv_AMG::readOn(Entree& is)
 {
   // amg GCP|BISGTSTAB|GMRES { atol|rtol doublee [st double] [impr]  }
   is >> solver_;
-  if ((Motcle)solver_!="GCP")
+  if ((Motcle)solver_=="GCP")
+    ksp_type_ = "cg";
+  else if ((Motcle)solver_=="GMRES")
+    ksp_type_ = "gmres";
+  else if ((Motcle)solver_=="BICGSTAB")
+    ksp_type_ = "bcgs";
+  else
     {
       Cerr << solver_ << " not supported yet for AMG !" << finl;
       Process::exit();
@@ -87,6 +91,19 @@ Entree& Solv_AMG::readOn(Entree& is)
   return is;
 }
 
+
+// hipsparse crashes on some GFX cards: return the PETSc options selecting the Kokkos backend there.
+Nom petsc_use_kokkos()
+{
+  // The target architecture is read from the ROCM_ARCH environment variable
+  const char* rocm_arch = std::getenv("ROCM_ARCH");
+  const bool use_kokkos = rocm_arch != nullptr && std::string(rocm_arch) == "gfx1100";
+  // No extra option unless the card is a gfx1100
+  if (!use_kokkos)
+    return Nom("");
+  return Nom(" -vec_type kokkos -mat_type aijkokkos ");
+}
+
 void Solv_AMG::create_block_amg(int n, Nom precond)
 {
   if (getenv("TRUST_AMG")!=nullptr) precond = getenv("TRUST_AMG");
@@ -95,6 +112,7 @@ void Solv_AMG::create_block_amg(int n, Nom precond)
   chaine_lue_+=petsc_cg_issue_ ? "gmres" : "cg"; // Switch CG to GMRES for more robustness (BiCGstab is slower than GMRES 2xSPMV vs 1)
   chaine_lue_+=rtol_>0 ? Nom(rtol_, " -ksp_rtol %e") : "";
   chaine_lue_+=atol_>0 ? Nom(atol_, " -ksp_atol %e") : "";
+  chaine_lue_+=petsc_use_kokkos();
   chaine_lue_+=" -ksp_norm_type UNPRECONDITIONED \
 -pc_type fieldsplit \
 -pc_fieldsplit_type additive";
@@ -107,19 +125,22 @@ void Solv_AMG::create_block_amg(int n, Nom precond)
       Cerr << "Use more GPUs, or try slower options: -fieldsplit_P0_pc_gamg_agg_nsmooths 0 -fieldsplit_P1_pc_gamg_agg_nsmooths 0" << finl;
       chaine_lue_+=" -info :pc -fieldsplit_P0_ksp_type preonly \
 -fieldsplit_P0_pc_type gamg \
--fieldsplit_P0_pc_gamg_threshold 0.01 \
 -fieldsplit_P0_pc_gamg_square_graph 1 \
--fieldsplit_P1_ksp_type preonly \
+-fieldsplit_P0_pc_gamg_threshold";
+      chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.01";
+      chaine_lue_+=" -fieldsplit_P1_ksp_type preonly \
 -fieldsplit_P1_pc_type gamg \
--fieldsplit_P1_pc_gamg_threshold 0.01 \
--fieldsplit_P1_pc_gamg_square_graph 1";
+-fieldsplit_P1_pc_gamg_square_graph 1 \
+-fieldsplit_P1_pc_gamg_threshold";
+      chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.01";
       if (n==3)
         {
           chaine_lue_+=" -fieldsplit_Pa_ksp_type preonly \
 -fieldsplit_Pa_ksp_type preonly \
 -fieldsplit_Pa_pc_type gamg \
--fieldsplit_Pa_pc_gamg_threshold 0.01 \
--fieldsplit_Pa_pc_gamg_square_graph 1";
+-fieldsplit_Pa_pc_gamg_square_graph 1 \
+-fieldsplit_Pa_pc_gamg_threshold";
+          chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.01";
         }
       // Use Kokkos backend (slower though) to avoid memory issue on Nvidia:
       // src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu:3269 cuda error 2 (cudaErrorMemoryAllocation) : out of memory
@@ -133,23 +154,27 @@ void Solv_AMG::create_block_amg(int n, Nom precond)
       chaine_lue_+=" -fieldsplit_P0_ksp_type preonly \
 -fieldsplit_P0_pc_type hypre \
 -fieldsplit_P0_pc_hypre_type boomeramg \
--fieldsplit_P0_pc_hypre_boomeramg_strong_threshold 0.1 \
 -fieldsplit_P0_pc_hypre_boomeramg_print_statistics 1 \
--fieldsplit_P1_ksp_type preonly \
+-fieldsplit_P0_pc_hypre_boomeramg_strong_threshold";
+      chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.1";
+      chaine_lue_+=" -fieldsplit_P1_ksp_type preonly \
 -fieldsplit_P1_pc_type hypre \
 -fieldsplit_P1_pc_hypre_type boomeramg \
--fieldsplit_P1_pc_hypre_boomeramg_strong_threshold 0.1 \
--fieldsplit_P1_pc_hypre_boomeramg_print_statistics 1";
+-fieldsplit_P1_pc_hypre_boomeramg_print_statistics 1 \
+-fieldsplit_P1_pc_hypre_boomeramg_strong_threshold";
+      chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.1";
       if (n==3)
         {
           chaine_lue_+=" -fieldsplit_Pa_ksp_type preonly \
 -fieldsplit_Pa_pc_type hypre \
 -fieldsplit_Pa_pc_hypre_type boomeramg \
--fieldsplit_Pa_pc_hypre_boomeramg_strong_threshold 0.1 \
--fieldsplit_Pa_pc_hypre_boomeramg_print_statistics 1";
+-fieldsplit_Pa_pc_hypre_boomeramg_print_statistics 1 \
+-fieldsplit_Pa_pc_hypre_boomeramg_strong_threshold";
+          chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.1";
         }
       // To avoid this issue on Nvidia:  CUSPARSE ERROR (code = 11, insufficient resources) at csr_spgemm_device_cusparse.c:152
-#ifdef TRUST_USE_CUDA
+      // Seen also on HIP on gfx1100
+#ifdef TRUST_USE_GPU
       if (n==2) chaine_lue_+=" -fieldsplit_P0_pc_mg_galerkin_mat_product_algorithm hypre";
       if (n==2) chaine_lue_+=" -fieldsplit_P1_pc_mg_galerkin_mat_product_algorithm hypre";
       if (n==3) chaine_lue_+=" -fieldsplit_Pa_pc_mg_galerkin_mat_product_algorithm hypre";
@@ -160,21 +185,24 @@ void Solv_AMG::create_block_amg(int n, Nom precond)
       Cerr << "Warning! PETSc with AmgX preconditioner was not tested yet for nnz>2^31 !" << finl;
       chaine_lue_+=" -fieldsplit_P0_ksp_type preonly \
 -fieldsplit_P0_pc_type amgx \
--fieldsplit_P0_pc_amgx_strength_threshold 0.1 \
 -fieldsplit_P0_pc_amgx_verbose 1 \
 -fieldsplit_P0_pc_amgx_print_grid_stats 1 \
--fieldsplit_P1_ksp_type preonly \
+-fieldsplit_P0_pc_amgx_strength_threshold";
+      chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.1";
+      chaine_lue_+=" -fieldsplit_P1_ksp_type preonly \
 -fieldsplit_P1_pc_type amgx \
--fieldsplit_P1_pc_amgx_strength_threshold 0.1 \
 -fieldsplit_P1_pc_amgx_verbose 1 \
--fieldsplit_P1_pc_amgx_print_grid_stats 1";
+-fieldsplit_P1_pc_amgx_print_grid_stats 1 \
+-fieldsplit_P1_pc_amgx_strength_threshold";
+      chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.1";
       if (n==3)
         {
           chaine_lue_+=" -fieldsplit_Pa_ksp_type preonly \
 -fieldsplit_Pa_pc_type amgx \
--fieldsplit_Pa_pc_amgx_strength_threshold 0.5 \
 -fieldsplit_Pa_pc_amgx_verbose 1 \
--fieldsplit_Pa_pc_amgx_print_grid_stats 1";
+-fieldsplit_Pa_pc_amgx_print_grid_stats 1 \
+-fieldsplit_Pa_pc_amgx_strength_threshold";
+          chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.5"; // Pa block: keep the former 0.5 default when st is not given
         }
     }
   else
@@ -182,52 +210,84 @@ void Solv_AMG::create_block_amg(int n, Nom precond)
   chaine_lue_ +=" }";
 }
 
-Nom boomeramg(double st)
+void Solv_AMG::create_gamg()
 {
-  Nom chaine(" { precond boomeramg { }");
-  if (st>=0)
-    {
-      chaine += " cli { -pc_hypre_boomeramg_strong_threshold";
-      chaine += Nom(st, "%e");
-      chaine += " }";
-    }
-  return chaine;
+  // Fill chaine_lue_ with the PETSc "cli { ... }" options for GAMG (possibly faster on VDF mesh than boomerAMG ?)
+  chaine_lue_="cli { -ksp_type ";
+  chaine_lue_+=petsc_cg_issue_ ? "gmres" : ksp_type_; // Switch to GMRES for more robustness (BiCGstab is slower than GMRES 2xSPMV vs 1)
+  chaine_lue_+=rtol_>0 ? Nom(rtol_, " -ksp_rtol %e") : ""; // tolerances are only emitted when strictly positive
+  chaine_lue_+=atol_>0 ? Nom(atol_, " -ksp_atol %e") : "";
+  chaine_lue_+=petsc_use_kokkos();
+  chaine_lue_+=" -ksp_norm_type UNPRECONDITIONED \
+-info :pc \
+-pc_type gamg \
+-mg_levels_ksp_max_it 1";
+  chaine_lue_+=" -pc_gamg_threshold";
+  chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.01"; // st_ if strictly positive, else 0.01
+  chaine_lue_ +=" }";
+}
+
+void Solv_AMG::create_boomeramg()
+{
+  // Fill chaine_lue_ with the PETSc "cli { ... }" options for hypre/boomeramg.
+  // Many options are set explicitly to get the same convergence on CPU and GPU,
+  // because by default PETSc enables different ones on CPU and GPU...
+  // Warning: boomeramg on CPU therefore behaves differently than in 1.9.8 !
+  // For instance, by default we have:  Coarsening Type = Falgout-CLJP and modified classical interpolation
+  // Warning ! ext+i seems RAM costly, ext+i-cc is better ? Add -pc_hypre_boomeramg_P_max 6, ideal for 3D VDF with 6 neighbours ?
+  // Chebyshev noticeably reduces the number of iterations, but ms/it is higher so there is no gain on the total
+  chaine_lue_="cli { -ksp_type ";
+  chaine_lue_+=petsc_cg_issue_ ? "gmres" : ksp_type_; // Switch to GMRES for more robustness (BiCGstab is slower than GMRES 2xSPMV vs 1)
+  chaine_lue_+=rtol_>0 ? Nom(rtol_, " -ksp_rtol %e") : "";
+  chaine_lue_+=atol_>0 ? Nom(atol_, " -ksp_atol %e") : "";
+  chaine_lue_+=petsc_use_kokkos();
+  chaine_lue_+=" -ksp_norm_type UNPRECONDITIONED \
+-pc_type hypre \
+-pc_hypre_type boomeramg \
+-pc_mg_galerkin_mat_product_algorithm hypre \
+-pc_hypre_boomeramg_relax_type_all l1scaled-Jacobi \
+-pc_hypre_boomeramg_no_CF true \
+-pc_hypre_boomeramg_coarsen_type pmis \
+-pc_hypre_boomeramg_interp_type ext+i \
+-pc_hypre_boomeramg_print_statistics 1";
+  chaine_lue_+=" -pc_hypre_boomeramg_strong_threshold";
+  chaine_lue_+=st_>0 ? Nom(st_, "  %e") : " 0.3"; // st_ if strictly positive, else 0.3
+  chaine_lue_ +=" }";
 }
+
 void Solv_AMG::create_amg()
 {
   // We select the more efficient/robust one:
-  chaine_lue_ = solver_;
 #if defined(TRUST_USE_CUDA)
-  library_ = "petsc_gpu";
-  chaine_lue_ += boomeramg(st_); // Best GPU solver
-  // KSP divergence with cg+boomeramg/amgx on multi-node with MPI GPU Aware (seen also on Lumi) so we switch to gmres (bgcs slower) !
-  if (Process::nproc()>4) petsc_cg_issue_ = true;
-#if defined(MPIX_CUDA_AWARE_SUPPORT)
-  if (Process::nproc()>4)
+  Nom precond = Process::nproc()<=4 ? "boomeramg" : "amgx";
+  if (getenv("TRUST_AMG")!=nullptr)
+    precond = getenv("TRUST_AMG"); // Surcharge possible pour test rapide
+
+  if (precond=="boomeramg")
+    {
+      library_ = "petsc_gpu";
+      return create_boomeramg(); // Best GPU solver
+    }
+  else if (precond=="amgx")
     {
+      // Switch to AmgX+AmgXWrapper (soon deprecated?)
       library_ = "amgx";
       chaine_lue_ = solver_;
       chaine_lue_ += " { precond c-amg {";
       if (st_>=0) chaine_lue_ += Nom(st_, " p:strength_threshold %e");
       chaine_lue_ += " }";
     }
-#endif
-#elif defined(TRUST_USE_ROCM)
-  library_ = "petsc_gpu";
-  const char* value = std::getenv("ROCM_ARCH");
-  if (value != nullptr && std::string(value) == "gfx1100")
+  else
     {
-      if (st_>=0) Process::exit("st option not supported yet in Solv_AMG");
-      if (Process::is_parallel())
-        chaine_lue_ += " { precond ua-amg { }";  // Converge mais plus lent que sa-amg
-      else
-        chaine_lue_ += " { precond sa-amg { }";  // Crash en parallele
+      library_ = "petsc_gpu";
+      return create_gamg();
     }
-  else
-    chaine_lue_ += boomeramg(st_); // Best GPU solver (// sa-amg is slow...)
+#elif defined(TRUST_USE_ROCM)
+  library_ = "petsc_gpu";
+  return create_boomeramg(); // Best GPU solver (// sa-amg is slow...)
 #else
   library_ = "petsc";
-  chaine_lue_ += boomeramg(st_); // Best CPU solver
+  return create_boomeramg(); // Best CPU solver
 #endif
   chaine_lue_ += rtol_>0 ? Nom(rtol_, " rtol %e") : Nom(atol_, " atol %e");
   if (impr_) chaine_lue_ += " impr";
@@ -235,49 +295,74 @@ void Solv_AMG::create_amg()
   chaine_lue_ += " }";
 }
 
+// Build the option string (create_amg, then create_block_amg when nb_blocks_>1) and instantiate solveur_ from it.
+void Solv_AMG::create_solver()
+{
+  create_amg();
+  if (nb_blocks_>1)
+    {
+      // Block matrix : we use PCFieldsplit (eg: VEF) for preconditioner. Much better convergence for P0P1 for instance
+      Cerr << "Detecting " << nb_blocks_ << "x" << nb_blocks_ << " blocks into the matrix. Creating a specific block preconditioning:" << finl;
+      if (chaine_lue_.contient("gamg"))
+        create_block_amg(nb_blocks_, "gamg");
+      else if (chaine_lue_.contient("boomeramg"))
+        create_block_amg(nb_blocks_, "boomeramg");
+      else if (library_=="amgx")
+        {
+          library_ = "petsc_gpu"; // the block preconditioner is driven through PETSc, AmgX being only the sub-preconditioner
+          create_block_amg(nb_blocks_, "amgx");
+        }
+    }
+  Cerr << "====================================================================" << finl;
+  Cerr << "[AMG] Creating solver: " << library_ << " " << chaine_lue_ << finl;
+  Cerr << "====================================================================" << finl;
+  EChaine entree(chaine_lue_);
+  Nom nom_solveur("Solv_");
+  nom_solveur+=library_;
+  solveur_.typer(nom_solveur);
+  solveur_.nommer("solveur_pression");
+  if (library_=="amgx")
+    ref_cast(Solv_AMGX, solveur_.valeur()).create_solver(entree);
+  else if (library_=="petsc")
+    ref_cast(Solv_Petsc, solveur_.valeur()).create_solver(entree);
+  else if (library_=="petsc_gpu")
+    ref_cast(Solv_Petsc_GPU, solveur_.valeur()).create_solver(entree);
+  else
+    Process::exit("Unsupported case in Solv_AMG::create_solver"); // message now names the function it lives in
+  solveur_->set_save_matrix(save_matrix());
+  solveur_->set_read_matrix(read_matrix());
+}
+
 int Solv_AMG::resoudre_systeme(const Matrice_Base& mat, const DoubleVect& b, DoubleVect& x)
 {
   // We don't create solver during readOn as usual but just before solve to get more infos about matrix/vectors to fine tune
   if (!solveur_)
     {
-      create_amg();
-      int nb_blocks = sub_type(MD_Vector_composite, b.get_md_vector().valeur()) ? ref_cast(MD_Vector_composite, b.get_md_vector().valeur()).nb_parts() : 1;
-      if (nb_blocks>1)
-        {
-          // Block matrix : we use PCFieldsplit (eg: VEF) for preconditioner
-          // Much better convergence for P0P1 for instance
-          Cerr << "Detecting " << nb_blocks << "x" << nb_blocks << " blocks into the matrix. Creating a specific block preconditioning:" << finl;
-          if (chaine_lue_.contient("gamg"))
-            create_block_amg(nb_blocks, "gamg");
-          else if (chaine_lue_.contient("boomeramg"))
-            create_block_amg(nb_blocks, "boomeramg");
-          else if (library_=="amgx")
-            {
-              library_ = "petsc_gpu";
-              create_block_amg(nb_blocks, "amgx");
-            }
-        }
-      Cerr << "====================================================================" << finl;
-      Cerr << "Creating AMG solver: " << library_ << " " << chaine_lue_ << finl;
-      Cerr << "====================================================================" << finl;
-      EChaine entree(chaine_lue_);
-      Nom nom_solveur("Solv_");
-      nom_solveur+=library_;
-      solveur_.typer(nom_solveur);
-      solveur_.nommer("solveur_pression");
-      if (library_=="amgx")
-        ref_cast(Solv_AMGX, solveur_.valeur()).create_solver(entree);
-      else if (library_=="petsc")
-        ref_cast(Solv_Petsc, solveur_.valeur()).create_solver(entree);
-      else if (library_=="petsc_gpu")
-        ref_cast(Solv_Petsc_GPU, solveur_.valeur()).create_solver(entree);
-      else
-        Process::exit("Unsupported case in Solv_AMG::readOn");
-      solveur_->set_save_matrix(save_matrix());
-      solveur_->set_read_matrix(read_matrix());
+      // Seen on Cuda, multi-node MPI-Cuda Aware but also on Lumi (amg unsymmetric preconditioner, cg may diverge)
+      // KSP divergence with cg+amg so we switch to gmres+amg (bcgs slower)
+      if (Process::nproc()>4) petsc_cg_issue_ = true;
+      nb_blocks_ = sub_type(MD_Vector_composite, b.get_md_vector().valeur()) ? ref_cast(MD_Vector_composite, b.get_md_vector().valeur()).nb_parts() : 1;
+      create_solver();
+      Cerr << "[AMG] If you experience OOM during setup, try to increase the strong threshold (st keyword): AMG XXX { rtol XXX impr st XXX }" << finl;
     }
   statistics().end_count(STD_COUNTERS::system_solver,-1,0);
-  int res = solveur_.resoudre_systeme(mat, b, x);
+  int nb_iter=0;
+  try
+    {
+      nb_iter = solveur_.resoudre_systeme(mat, b, x);
+    }
+  catch(...) // any exception thrown by the solve: rebuild the solver and retry once
+    {
+      statistics().end_count(STD_COUNTERS::system_solver,1,nb_iter); // nb_iter is still 0 here
+      petsc_cg_issue_ = true; // makes the option builders select gmres as KSP type
+      create_solver(); // regenerates the option string and re-types solveur_
+      nb_iter = solveur_.resoudre_systeme(mat, b, x); // an exception thrown here is not caught
+    }
   statistics().begin_count(STD_COUNTERS::system_solver,statistics().get_last_opened_counter_level()+1);
-  return res;
+  if (rtol_<0)
+    {
+      Cout << "Warning: you define only atol (absolute tolerance, dimensional value) for the AMG solver." << finl;
+      Cout << "Strongly recomended to rather define rtol (relative tolerance) as the first convergence criteria and atol as a second criteria." << finl;
+    }
+  return nb_iter;
 }
diff --git a/src/Kernel/Math/SolvSys/Solv_AMG.h b/src/Kernel/Math/SolvSys/Solv_AMG.h
index 831e5a2ec3..443f21c81f 100644
--- a/src/Kernel/Math/SolvSys/Solv_AMG.h
+++ b/src/Kernel/Math/SolvSys/Solv_AMG.h
@@ -40,11 +40,16 @@ public :
 private :
   void create_amg();
   void create_block_amg(int,Nom);
+  void create_gamg();
+  void create_boomeramg();
+  void create_solver();
   SolveurSys solveur_;
   Nom library_="", solver_="", options_="";
   double rtol_=-1, atol_=-1, st_=-1;
   bool impr_ = false;
   bool petsc_cg_issue_ = false;
+  int nb_blocks_ = 1; // number of parts of the rhs MD_Vector_composite (1 otherwise); >1 enables block preconditioning
+  std::string ksp_type_ = ""; // PETSc KSP type set in readOn from the solver keyword: "cg", "gmres" or "bcgs"
 };
 
 #endif
diff --git a/src/Kernel/Math/SolvSys/Solv_Externe.cpp b/src/Kernel/Math/SolvSys/Solv_Externe.cpp
index e5255625c5..3ffe07daeb 100644
--- a/src/Kernel/Math/SolvSys/Solv_Externe.cpp
+++ b/src/Kernel/Math/SolvSys/Solv_Externe.cpp
@@ -36,14 +36,7 @@ Entree& Solv_Externe::readOn(Entree& is)
 
 void Solv_Externe::MorseSymToMorse(const Matrice_Morse_Sym& MS, Matrice_Morse& M)
 {
-  M = MS;
-  Matrice_Morse mattmp(MS);
-  M.transpose(mattmp);
-  int ordre = M.ordre();
-  for (int i=0; i<ordre; i++)
-    if (M.nb_vois(i))
-      M(i, i) = 0.;
-  M = mattmp + M;
+  M.convert(MS);
 }
 
 void Solv_Externe::construit_matrice_morse_intermediaire(const Matrice_Base& la_matrice, Matrice_Morse& matrice_morse_intermediaire)
@@ -101,6 +94,7 @@ const ArrOfInt& Solv_Externe::indice_coeff_to_keep(const Matrice_Morse& matrice_
       const int n = tab1.size_array() - 1;
       auto nnz(tab1[0]);
       nnz = 0;
+      ToDo_Kokkos("critical");
       for (int i = 0; i < n; i++)
         {
           if (items_to_keep_[i])
@@ -108,6 +102,7 @@ const ArrOfInt& Solv_Externe::indice_coeff_to_keep(const Matrice_Morse& matrice_
         }
       indice_coeff_to_keep_.resize((int)nnz);
       nnz = 0;
+      ToDo_Kokkos("critical");
       for (int i = 0; i < n; i++)
         {
           if (items_to_keep_[i])
diff --git a/src/Kernel/Math/SolvSys/Solv_Externe.h b/src/Kernel/Math/SolvSys/Solv_Externe.h
index f831cdffca..222a53410c 100644
--- a/src/Kernel/Math/SolvSys/Solv_Externe.h
+++ b/src/Kernel/Math/SolvSys/Solv_Externe.h
@@ -40,7 +40,7 @@ class Solv_Externe : public SolveurSys_base, public Solv_tools
   void construit_matrice_morse_intermediaire(const Matrice_Base&, Matrice_Morse& );
   void MorseSymToMorse(const Matrice_Morse_Sym& MS, Matrice_Morse& M);
   void Create_lhs_rhs_onDevice();
-  public_for_cuda
+  protected_but_public_for_cuda
   template<typename ExecSpace>
   void Update_lhs_rhs(const DoubleVect& b, DoubleVect& x);
   template<typename ExecSpace>
diff --git a/src/Kernel/Math/SolvSys/Solv_Gmres.h b/src/Kernel/Math/SolvSys/Solv_Gmres.h
index c78ef1143e..a7e256b458 100644
--- a/src/Kernel/Math/SolvSys/Solv_Gmres.h
+++ b/src/Kernel/Math/SolvSys/Solv_Gmres.h
@@ -42,7 +42,6 @@ protected :
   int lire_motcle_non_standard(const Motcle&, Entree&) override;
 
   int Gmres(const Matrice_Morse&, const DoubleVect&, DoubleVect& );
-  int gmres_local( const Matrice_Morse& A, const DoubleVect& b, DoubleVect& tab_x1);
 
   DoubleVects v; //espcace Krilov
   bool is_local_gmres = false;
@@ -52,6 +51,8 @@ protected :
   DoubleTab h;
   DoubleVect r;
   DoubleVect h_loc, dh_loc;
+  protected_but_public_for_cuda
+  int gmres_local( const Matrice_Morse& A, const DoubleVect& b, DoubleVect& tab_x1);
 };
 
 #endif /* Solv_Gmres_included */
diff --git a/src/Kernel/Math/SolvSys/Solv_Petsc.cpp b/src/Kernel/Math/SolvSys/Solv_Petsc.cpp
index 84c97a67d7..0dd11a0c20 100644
--- a/src/Kernel/Math/SolvSys/Solv_Petsc.cpp
+++ b/src/Kernel/Math/SolvSys/Solv_Petsc.cpp
@@ -1315,6 +1315,9 @@ void Solv_Petsc::create_solver(Entree& entree)
                 PCSetType(PreconditionneurPetsc_, PCHYPRE);
                 PCHYPRESetType(PreconditionneurPetsc_, "boomeramg"); // Classical C-AMG
                 pc_supported_on_gpu_by_petsc=1;
+#ifdef TRUST_USE_CUDA
+                add_option("pc_mg_galerkin_mat_product_algorithm", "hypre"); // Avoid OOM on device on CUDA
+#endif
                 // Changement pc_hypre_boomeramg_relax_type_all pour PETSc 3.10, la matrice de
                 // preconditionnement etant seqaij, symetric-SOR/jacobi (defaut) provoque KSP_DIVERGED_INDEFINITE_PC
                 // Voir: https://lists.mcs.anl.gov/mailman/htdig/petsc-users/2012-December/015922.html
@@ -2042,7 +2045,7 @@ int Solv_Petsc::resoudre_systeme(const Matrice_Base& la_matrice, const DoubleVec
           if (dm_!=nullptr)
             DMDestroy(&dm_);
         }
-
+      has_constant_nullspace_ = la_matrice.has_constant_nullspace();
       matrice_symetrique_ = true;      // On suppose que la matrice est symetrique
 
       // Construction de la numerotation globale:
@@ -2285,16 +2288,13 @@ int Solv_Petsc::solve(ArrOfDouble& residu)
           Cerr << "KSP_DIVERGED_ITS" << finl;
           Cerr << "That means the solver didn't converge within the maximal iterations number." << finl;
           Cerr << "You can change the maximal number of iterations with the -ksp_max_it option." << finl;
-#ifdef MPIX_CUDA_AWARE_SUPPORT
-          // Probleme vu avec GPU direct si >= 4 GPUs et preconditinneurs C-AMG ou BOOMERAMG
-          // OK pour SA-AMG et Jacobi
-          // Il faudrait faire un reproducer a soumettre a PETSc...
-          Cerr << "It seems there is a convergence issue (bug?) with MPI GPU Aware library with PETSc CG and some preconditioners." << finl;
-          Cerr << "Try using BICGSTAB instead of GCP to bypass the issue." << finl;
-          Process::exit();
-#endif
         }
       else Cerr << (int)Reason << finl;
+      if ((Reason==KSP_DIVERGED_INDEFINITE_PC || Reason==KSP_DIVERGED_INDEFINITE_MAT) && type_ksp_ == "cg")
+        {
+          Cerr << "It seems you are using GCP but with an unsymmetric preconditioning." << finl;
+          Cerr << "Try using GMRES or BICGSTAB to bypass this issue of non convergence." << finl;
+        }
       throw Reason;
     }
   if (Reason<0 && !return_on_error_) exit();
@@ -3272,7 +3272,6 @@ void Solv_Petsc::Update_matrix(Mat& MatricePetsc, const Matrice_Morse& mat_morse
       // On dimensionne ces tableaux a la taille la plus grande possible
       // ToDo : recalcul de nnz utile ?
       ArrOfInt nnz(nb_rows_);
-      nnz = 0;
       ArrOfTID& renum_array = renum_;  // tab seen as a flat array (can't use ArrOfPetscInt& because of C++ ref cast...)
       const auto& tab1 = mat_morse.get_tab1();
       const auto& tab2 = mat_morse.get_tab2();
@@ -3338,6 +3337,26 @@ void Solv_Petsc::Update_matrix(Mat& MatricePetsc, const Matrice_Morse& mat_morse
       MatAssemblyEnd(MatricePetsc, MAT_FINAL_ASSEMBLY);
     }
 
+  /* attach null space if any */
+  if (has_constant_nullspace_)
+    {
+      MatNullSpace nullsp;
+      MatNullSpaceCreate(PETSC_COMM_WORLD, PETSC_TRUE, 0, PETSC_NULLPTR, &nullsp);
+      MatSetNullSpace(MatricePetsc, nullsp);
+      MatSetNearNullSpace(MatricePetsc, nullsp);   // useful for AMG
+      PetscBool isNull;
+      MatNullSpaceTest(nullsp,MatricePetsc,&isNull);
+      if (!isNull)
+        {
+          Cerr << "[Petsc] Warning, matrix has not null space as specified! We remove it..." << finl;
+          MatSetNullSpace(MatricePetsc, PETSC_NULLPTR);
+          MatSetNearNullSpace(MatricePetsc, PETSC_NULLPTR);
+        }
+      else
+        Cerr << "[Petsc] Creating null space on the matrix." << finl;
+      MatNullSpaceDestroy(&nullsp);
+    }
+
   if (!nouveau_stencil_ && reorder_matrix_)
     {
       Mat Aperm;
diff --git a/src/Kernel/Math/SolvSys/Solv_Petsc.h b/src/Kernel/Math/SolvSys/Solv_Petsc.h
index e1a8048def..76fa80082e 100644
--- a/src/Kernel/Math/SolvSys/Solv_Petsc.h
+++ b/src/Kernel/Math/SolvSys/Solv_Petsc.h
@@ -105,7 +105,7 @@ public :
   {
     return amgx_initialized_;
   };
-#if PETSC_VERSION_GE(3,24,0)
+#if PETSC_VERSION_GE(3,24,0) && PETSC_VERSION_LT(3,25,0)
   PetscErrorCode set_convergence_test(PetscErrorCode (*converge)(KSP,PetscInt,PetscReal,KSPConvergedReason*,void*),void *cctx,PetscErrorCode (*destroy)(void**))
 #else
   PetscErrorCode set_convergence_test(PetscErrorCode (*converge)(KSP,PetscInt,PetscReal,KSPConvergedReason*,void*),void *cctx,PetscErrorCode (*destroy)(void*))
@@ -124,7 +124,7 @@ public :
   }
 #endif
 
-  public_for_cuda
+  protected_but_public_for_cuda
 #ifdef PETSCKSP_H
   virtual void Update_matrix(Mat& MatricePetsc, const Matrice_Morse& mat_morse); // Fill the (previously allocated) PETSc matrix with mat_morse coefficients
 #endif
@@ -197,7 +197,7 @@ protected :
   VecScatter VecScatter_;	// Scatter context needed when petsc_decide_=1 to gather values of global to local solution
 #endif
 
-
+  bool has_constant_nullspace_ = false;          // To enable Null Space treatment
   int solveur_direct_ = no;          // Pour savoir si l'on manipule un solveur direct et non iteratif
   bool gpu_ = false;                    // Utilisation des solveurs GPU de PETSc
   bool amgx_ = false;			// Utilisation des solveurs GPU de AMGX
diff --git a/src/Kernel/Math/TRUSTTab.tpp b/src/Kernel/Math/TRUSTTab.tpp
index ae67efae1c..ae6c61bd6b 100644
--- a/src/Kernel/Math/TRUSTTab.tpp
+++ b/src/Kernel/Math/TRUSTTab.tpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -17,6 +17,7 @@
 #define TRUSTTab_TPP_included
 
 #include <TRUSTTab.h>
+#include <MD_Vector_seq.h>
 
 // TODO : FIXME : delete
 template<typename _TYPE_, typename _SIZE_>
@@ -675,8 +676,13 @@ inline void TRUSTTab<_TYPE_,_SIZE_>::set_md_vector(const MD_Vector& md_vector)
 #ifndef LATATOOLS
   _SIZE_ dim0 = dimension_tot_0_;
   if (md_vector.non_nul())
-    // renvoie -1 si l'appel est invalide ou si le MD_Vector est mix (cf doc MD_Vector_base):
-    dim0 = md_vector->get_nb_items_reels();
+    {
+      if (sub_type(MD_Vector_seq, md_vector.valeur()))
+        dim0 = (_SIZE_)md_vector->nb_items_seq_tot();
+      else
+        // renvoie -1 si l'appel est invalide ou si le MD_Vector est mix (cf doc MD_Vector_base):
+        dim0 = md_vector->get_nb_items_reels();
+    }
   dimensions_[0] = dim0;
   assert(verifie_LINE_SIZE());
   // a appeler meme pour un md_vector nul (pour remettre size_reelle_):
@@ -759,9 +765,6 @@ template<typename _TYPE_, typename _SIZE_>
 template <typename _T_>
 inline void TRUSTTab<_TYPE_,_SIZE_>::ajoute_produit_tensoriel(_T_ alpha, const TRUSTTab<_T_,_SIZE_>& x, const TRUSTTab<_T_,_SIZE_>& y)
 {
-  this->ensureDataOnHost();
-  x.ensureDataOnHost();
-  y.ensureDataOnHost();
   // Tableaux vus comme des tableaux unidimensionnels (pour ne pas avoir a gerer nb_dim)
   const TRUSTVect<_T_,_SIZE_>& vx = x, &vy = y;
   TRUSTVect<_T_,_SIZE_>& v = *this;
@@ -790,23 +793,9 @@ inline void TRUSTTab<_TYPE_,_SIZE_>::ajoute_produit_tensoriel(_T_ alpha, const T
           bloc_itr = Block_Iter<_SIZE_>(0, v.size_totale() / v.line_size());   // iterator on a single (big) block
         }
     }
+  if (nblocs_left == 0) return;
 
-  for (; nblocs_left; nblocs_left--)
-    {
-      const _SIZE_ debut = (*(bloc_itr++)), fin = (*(bloc_itr++));
-      _SIZE_ v_index = debut * line_size_xy;
-      for (_SIZE_ i = debut; i < fin; i++)
-        for (_SIZE_ j = 0; j < line_size_x; j++)
-          {
-            _T_ xval = vx[i * line_size_x + j];
-            for (_SIZE_ k = 0; k < line_size_y; k++)
-              {
-                _T_ yval = vy[i * line_size_y + k];
-                v[v_index] += alpha * xval * yval;
-                v_index++;
-              }
-          }
-    }
+  ::ajoute_produit_tensoriel(alpha, v, vx, vy, nblocs_left, bloc_itr, line_size_x, line_size_y, line_size_xy);
 }
 
 //  Resolution du systeme Ax=b
diff --git a/src/Kernel/Math/TRUSTTab_tools.cpp b/src/Kernel/Math/TRUSTTab_tools.cpp
index 04063838b9..a385ef92d4 100644
--- a/src/Kernel/Math/TRUSTTab_tools.cpp
+++ b/src/Kernel/Math/TRUSTTab_tools.cpp
@@ -141,3 +141,29 @@ template void local_carre_norme_tab<double>(const TRUSTTab<double,int>& tableau,
 template void local_carre_norme_tab<float>(const TRUSTTab<float,int>& tableau, TRUSTArray<float,int>& norme_colonne);
 template void local_max_abs_tab<double>(const TRUSTTab<double,int>& tableau, TRUSTArray<double,int>& max_colonne);
 template void local_max_abs_tab<float>(const TRUSTTab<float,int>& tableau, TRUSTArray<float,int>& max_colonne);
+
+template <typename _T_, typename _SIZE_>
+void ajoute_produit_tensoriel(_T_ alpha, TRUSTVect<_T_,_SIZE_>& tab_v, const TRUSTVect<_T_,_SIZE_>& tab_x, const TRUSTVect<_T_,_SIZE_>& tab_y,
+                              int nblocs_left, Block_Iter<_SIZE_> bloc_itr,
+                              int line_size_x, int line_size_y, int line_size_xy)
+{
+  auto x = tab_x.template view_ro<1>().data();
+  auto y = tab_y.template view_ro<1>().data();
+  auto v = tab_v.template view_rw<1>().data();
+#ifdef TRUST_USE_GPU
+  if (nblocs_left > 3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as ajoute_produit_tensoriel_kernel");
+#endif
+  for (; nblocs_left; nblocs_left--)
+    {
+      const _SIZE_ debut = (*(bloc_itr++)), fin = (*(bloc_itr++));
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_3D({debut, 0, 0}, {fin, line_size_x, line_size_y}),
+                           KOKKOS_LAMBDA(const int i, const int j, const int k)
+      {
+        v[i * line_size_xy + j * line_size_y + k] += alpha * x[i * line_size_x + j] * y[i * line_size_y + k];
+      });
+      end_gpu_timer(__KERNEL_NAME__);
+    }
+}
+
+template void ajoute_produit_tensoriel<double, int>(double alpha, TRUSTVect<double,int>& v, const TRUSTVect<double,int>& vx, const TRUSTVect<double,int>& vy, int nblocs_left, Block_Iter<int> bloc_itr, int line_size_x, int line_size_y, int line_size_xy);
+template void ajoute_produit_tensoriel<float,  int>(float  alpha, TRUSTVect<float, int>& v, const TRUSTVect<float, int>& vx, const TRUSTVect<float, int>& vy, int nblocs_left, Block_Iter<int> bloc_itr, int line_size_x, int line_size_y, int line_size_xy);
diff --git a/src/Kernel/Math/TRUSTTab_tools.tpp b/src/Kernel/Math/TRUSTTab_tools.tpp
index 84e6974d06..8f18f5cce5 100644
--- a/src/Kernel/Math/TRUSTTab_tools.tpp
+++ b/src/Kernel/Math/TRUSTTab_tools.tpp
@@ -64,6 +64,11 @@ inline void mp_max_abs_tab(const TRUSTTab<_T_,int>& tableau, TRUSTArray<_T_,int>
   Process::mp_max_for_each_item(max_colonne);
 }
 
+template <typename _T_, typename _SIZE_>
+extern void ajoute_produit_tensoriel(_T_ alpha, TRUSTVect<_T_,_SIZE_>& v, const TRUSTVect<_T_,_SIZE_>& vx, const TRUSTVect<_T_,_SIZE_>& vy,
+                                     int nblocs_left, Block_Iter<_SIZE_> bloc_itr,
+                                     int line_size_x, int line_size_y, int line_size_xy);
+
 #ifndef LATATOOLS
 /**
  * @brief Compares two `TRUSTTab<double, _SZ_>` objects for equality.
diff --git a/src/Kernel/Math/TRUSTVect_tools.cpp b/src/Kernel/Math/TRUSTVect_tools.cpp
index 1a06a09268..7ecb662cd9 100644
--- a/src/Kernel/Math/TRUSTVect_tools.cpp
+++ b/src/Kernel/Math/TRUSTVect_tools.cpp
@@ -142,43 +142,30 @@ template void ajoute_produit_scalaire<float, int>(TRUSTVect<float, int>& resu, f
 namespace
 {
 template<typename ExecSpace, typename _TYPE_, typename _SIZE_, bool IS_MUL>
-void operation_speciale_tres_generic_kernel(TRUSTVect<_TYPE_, _SIZE_>& resu, const TRUSTVect<_TYPE_, _SIZE_>& vx, int nblocs_left,
+void operation_speciale_tres_generic_kernel(TRUSTVect<_TYPE_, _SIZE_>& tab_resu, const TRUSTVect<_TYPE_, _SIZE_>& tab_vx, int nblocs_left,
                                             Block_Iter<_SIZE_>& bloc_itr, const int line_size_vx, const _SIZE_ vect_size_tot, const int delta_line_size)
 {
-  auto vx_view= vx.template view_ro<1, ExecSpace>().data();
-  auto resu_view= resu.template view_rw<1, ExecSpace>().data();
-#ifdef TRUST_USE_GPU
-  if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel");
-#endif
-  for (; nblocs_left; nblocs_left--)
-    {
-      // Get index of next bloc start:
-      const int begin_bloc = (*(bloc_itr++)) * line_size_vx;
-      const int end_bloc = (*(bloc_itr++)) * line_size_vx;
-
-      assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc);
-
-      // Adjust pointers to indices
-      const int resu_start_idx = begin_bloc * delta_line_size;
-
-      Kokkos::RangePolicy<ExecSpace> policy(begin_bloc, end_bloc);
-      if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
-      Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const int i)
+  auto vx = tab_vx.template view_ro<1, ExecSpace>().data();
+  auto resu = tab_resu.template view_rw<1, ExecSpace>().data();
+  const bool has_items = static_cast<bool>(bloc_itr.items_);
+  auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr;
+  const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size_vx;
+  if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, n_items), KOKKOS_LAMBDA(const _SIZE_ i)
+  {
+    for (int jv = 0; jv < line_size_vx; jv++)
       {
-        const _TYPE_ x = vx_view[i];
-
-        //The // for could be also placed there
+        const _SIZE_ vx_i = (has_items ? items[i] : i) * line_size_vx + jv;
+        const _TYPE_ x = vx[vx_i];
         for (int j = 0; j < delta_line_size; ++j)
           {
-            const int resu_idx = resu_start_idx + i * delta_line_size + j;
-            if (IS_MUL)
-              resu_view[resu_idx] *= x;
-            else //If it's not MUL, it's DIV
-              resu_view[resu_idx] *= ((_TYPE_)1 / x);
+            const _SIZE_ resu_idx = vx_i * delta_line_size + j;
+            if (IS_MUL) resu[resu_idx] *= x;
+            else resu[resu_idx] *= ((_TYPE_)1 / x);
           }
-      });
-      if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
-    }
+      }
+  });
+  if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
 }
 }
 #endif
@@ -192,17 +179,17 @@ void operation_speciale_tres_generic(TRUSTVect<_TYPE_, _SIZE_>& resu, const TRUS
   static constexpr bool IS_MUL = (_TYPE_OP_ == TYPE_OPERATION_VECT_SPEC_GENERIC::MUL_); //it's either MUL or DIV
 
   // get info for computation
-  const int line_size = resu.line_size(), line_size_vx = vx.line_size(), vect_size_tot = resu.size_totale();
+  const int line_size = resu.line_size(), line_size_vx = vx.line_size();
   const MD_Vector& md = resu.get_md_vector();
   // Le line_size du vecteur resu doit etre un multiple du line_size du vecteur vx
   assert(line_size > 0 && line_size_vx > 0 && line_size % line_size_vx == 0);
   const int delta_line_size = line_size / line_size_vx;
-  assert(vx.size_totale() * delta_line_size == vect_size_tot); // this test is necessary if md is null
+  assert(vx.size_totale() * delta_line_size == resu.size_totale()); // this test is necessary if md is null
   assert(vx.get_md_vector() == md);
 
-  // Determine blocs of data to process, depending on " opt"
+  // Determine blocs of data to process using vx dimensions so items_ holds flat vx indices
   int nblocs_left;
-  Block_Iter<_SIZE_> bloc_itr = ::determine_blocks(opt, md, vect_size_tot, line_size, nblocs_left);
+  Block_Iter<_SIZE_> bloc_itr = ::determine_blocks(opt, md, vx.size_totale(), line_size_vx, nblocs_left);
   // Shortcut for empty arrays (avoid case line_size == 0)
   if (bloc_itr.empty())
     return;
@@ -211,9 +198,9 @@ void operation_speciale_tres_generic(TRUSTVect<_TYPE_, _SIZE_>& resu, const TRUS
 
   //Lauch computation with the execution space and view types as (template) parameters
   if (kernelOnDevice)
-    operation_speciale_tres_generic_kernel<Kokkos::DefaultExecutionSpace, _TYPE_, _SIZE_, IS_MUL>(resu, vx, nblocs_left, bloc_itr, line_size_vx, vect_size_tot, delta_line_size);
+    operation_speciale_tres_generic_kernel<Kokkos::DefaultExecutionSpace, _TYPE_, _SIZE_, IS_MUL>(resu, vx, nblocs_left, bloc_itr, line_size_vx, vx.size_totale(), delta_line_size);
   else
-    operation_speciale_tres_generic_kernel<Kokkos::DefaultHostExecutionSpace, _TYPE_, _SIZE_, IS_MUL>(resu, vx, nblocs_left, bloc_itr, line_size_vx, vect_size_tot, delta_line_size);
+    operation_speciale_tres_generic_kernel<Kokkos::DefaultHostExecutionSpace, _TYPE_, _SIZE_, IS_MUL>(resu, vx, nblocs_left, bloc_itr, line_size_vx, vx.size_totale(), delta_line_size);
 
 #ifndef NDEBUG
   // In debug mode, put invalid values where data has not been computed
@@ -237,35 +224,26 @@ template void operation_speciale_tres_generic<TYPE_OPERATION_VECT_SPEC_GENERIC::
 namespace
 {
 template<typename ExecSpace, typename _TYPE_, typename _SIZE_, bool IS_ADD>
-void operation_speciale_generic_kernel(TRUSTVect<_TYPE_, _SIZE_>& resu, const TRUSTVect<_TYPE_, _SIZE_>& vx, _TYPE_ alpha, int nblocs_left,
+void operation_speciale_generic_kernel(TRUSTVect<_TYPE_, _SIZE_>& tab_resu, const TRUSTVect<_TYPE_, _SIZE_>& tab_vx, _TYPE_ alpha, int nblocs_left,
                                        Block_Iter<_SIZE_>& bloc_itr, const _SIZE_ vect_size_tot, const int line_size)
 {
-  auto vx_view= vx.template view_ro<1, ExecSpace>().data();
-  auto resu_view= resu.template view_rw<1, ExecSpace>().data();
-#ifdef TRUST_USE_GPU
-  if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel");
-#endif
-  for (; nblocs_left; nblocs_left--)
-    {
-      // Get index of next bloc start:
-      const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size;
-      const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size;
-
-      assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc);
-
-      Kokkos::RangePolicy<ExecSpace> policy(begin_bloc, end_bloc);
-      if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
-      Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const int i)
+  auto vx = tab_vx.template view_ro<1, ExecSpace>().data();
+  auto resu = tab_resu.template view_rw<1, ExecSpace>().data();
+  const bool has_items = static_cast<bool>(bloc_itr.items_);
+  auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr;
+  const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size;
+  if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, n_items), KOKKOS_LAMBDA(const _SIZE_ i)
+  {
+    for (int j = 0; j < line_size; j++)
       {
-        const _TYPE_ x = vx_view[i];
-
-        if (IS_ADD) //done at compile time
-          resu_view[i] += alpha * x;
-        else //If it's not ADD, it's SQUARE
-          resu_view[i] += alpha * x * x;
-      });
-      if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
-    }
+        const _SIZE_ item = (has_items ? items[i] : i) * line_size + j;
+        const _TYPE_ x = vx[item];
+        if (IS_ADD) resu[item] += alpha * x;
+        else resu[item] += alpha * x * x;
+      }
+  });
+  if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
 }
 }
 #endif
@@ -316,7 +294,7 @@ template void ajoute_operation_speciale_generic<TYPE_OPERATION_VECT_SPEC::SQUARE
 namespace
 {
 template<typename ExecSpace, typename _TYPE_, typename _SIZE_, TYPE_OPERATOR_VECT _TYPE_OP_>
-void operator_vect_vect_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, const TRUSTVect<_TYPE_, _SIZE_>& vx, int nblocs_left,
+void operator_vect_vect_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& tab_resu, const TRUSTVect<_TYPE_, _SIZE_>& tab_vx, int nblocs_left,
                                        Block_Iter<_SIZE_>& bloc_itr,  const _SIZE_ vect_size_tot, const int line_size)
 {
   static constexpr bool IS_ADD = (_TYPE_OP_ == TYPE_OPERATOR_VECT::ADD_), IS_SUB = (_TYPE_OP_ == TYPE_OPERATOR_VECT::SUB_),
@@ -324,35 +302,30 @@ void operator_vect_vect_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, const TRU
                         IS_EGAL = (_TYPE_OP_ == TYPE_OPERATOR_VECT::EGAL_);
 
 #ifdef TRUST_USE_GPU
-  auto vx_view= vx.template view_ro<1, ExecSpace>().data();
-  auto resu_view= resu.template view_rw<1, ExecSpace>().data();
-#ifdef TRUST_USE_GPU
-  if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel");
-#endif
-  for (; nblocs_left; nblocs_left--)
-    {
-      // Get index of next bloc start:
-      const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size;
-      const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size;
-
-      assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc);
-      Kokkos::RangePolicy<ExecSpace> policy(begin_bloc, end_bloc);
-      if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
-      Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const _SIZE_ i)
+  auto vx = tab_vx.template view_ro<1, ExecSpace>().data();
+  auto resu = tab_resu.template view_rw<1, ExecSpace>().data();
+  const bool has_items = static_cast<bool>(bloc_itr.items_);
+  auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr;
+  const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size;
+  if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, n_items), KOKKOS_LAMBDA(const _SIZE_ i)
+  {
+    for (int j = 0; j < line_size; j++)
       {
-        const _TYPE_ x = vx_view[i];
-        if (IS_ADD) resu_view[i] += x;
-        if (IS_SUB) resu_view[i] -= x;
-        if (IS_MULT) resu_view[i] *= x;
-        if (IS_DIV) resu_view[i] /= x;
-        if (IS_EGAL) resu_view[i] = x;
-      });
-      if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
-    }
+        const _SIZE_ item = (has_items ? items[i] : i) * line_size + j;
+        const _TYPE_ x = vx[item];
+        if (IS_ADD)  resu[item] += x;
+        if (IS_SUB)  resu[item] -= x;
+        if (IS_MULT) resu[item] *= x;
+        if (IS_DIV)  resu[item] /= x;
+        if (IS_EGAL) resu[item] = x;
+      }
+  });
+  if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
 #else
   // Need to keep C++ optimized (pointer) implementation for PolyMAC_CDO in Flica5
-  _TYPE_ *resu_base = resu.data();
-  const _TYPE_ *x_base = vx.data();
+  _TYPE_ *resu_base = tab_resu.data();
+  const _TYPE_ *x_base = tab_vx.data();
   for (; nblocs_left; nblocs_left--)
     {
       // Get index of next bloc start:
@@ -429,7 +402,7 @@ template void operator_vect_vect_generic<float, int, TYPE_OPERATOR_VECT::EGAL_>(
 namespace
 {
 template<typename ExecSpace, typename _TYPE_, typename _SIZE_, TYPE_OPERATOR_SINGLE _TYPE_OP_>
-void operator_vect_single_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, const _TYPE_ x, int nblocs_left,
+void operator_vect_single_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& tab_resu, const _TYPE_ x, int nblocs_left,
                                          Block_Iter<_SIZE_>& bloc_itr,  const _SIZE_ vect_size_tot, const int line_size)
 {
   static constexpr bool IS_ADD = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::ADD_), IS_SUB = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::SUB_),
@@ -437,34 +410,29 @@ void operator_vect_single_generic_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu, const _
                         IS_NEGATE = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::NEGATE_), IS_INV = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::INV_), IS_ABS = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::ABS_),
                         IS_SQRT = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::SQRT_), IS_SQUARE = (_TYPE_OP_ == TYPE_OPERATOR_SINGLE::SQUARE_);
 
-  auto resu_view= resu.template view_rw<1, ExecSpace>().data();
-#ifdef TRUST_USE_GPU
-  if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel");
-#endif
-  for (; nblocs_left; nblocs_left--)
-    {
-      // Get index of next bloc start:
-      const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size;
-      const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size;
-
-      assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc);
-      Kokkos::RangePolicy<ExecSpace> policy(begin_bloc, end_bloc);
-      if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
-      Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const _SIZE_ i)
+  auto resu = tab_resu.template view_rw<1, ExecSpace>().data();
+  const bool has_items = static_cast<bool>(bloc_itr.items_);
+  auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr;
+  const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size;
+  if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, n_items), KOKKOS_LAMBDA(const _SIZE_ i)
+  {
+    for (int j = 0; j < line_size; j++)
       {
-        if (IS_SUB) resu_view[i] -= x;
-        if (IS_ADD) resu_view[i] += x;
-        if (IS_MULT) resu_view[i] *= x;
-        if (IS_EGAL) resu_view[i] = x;
-        if (IS_NEGATE) resu_view[i] = -resu_view[i];
-        if (IS_ABS) resu_view[i] = (_TYPE_) Kokkos::abs(resu_view[i]);
-        if (IS_SQRT) resu_view[i] = (_TYPE_) Kokkos::sqrt(resu_view[i]);
-        if (IS_SQUARE) resu_view[i] = resu_view[i]*resu_view[i];
-        if (IS_DIV) resu_view[i] /= x;
-        if (IS_INV) resu_view[i] = (_TYPE_) ((_TYPE_)1 /resu_view[i]);
-      });
-      if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
-    }
+        const _SIZE_ item = (has_items ? items[i] : i) * line_size + j;
+        if (IS_SUB)    resu[item] -= x;
+        if (IS_ADD)    resu[item] += x;
+        if (IS_MULT)   resu[item] *= x;
+        if (IS_EGAL)   resu[item] = x;
+        if (IS_NEGATE) resu[item] = -resu[item];
+        if (IS_ABS)    resu[item] = (_TYPE_) Kokkos::abs(resu[item]);
+        if (IS_SQRT)   resu[item] = (_TYPE_) Kokkos::sqrt(resu[item]);
+        if (IS_SQUARE) resu[item] = resu[item]*resu[item];
+        if (IS_DIV)    resu[item] /= x;
+        if (IS_INV)    resu[item] = (_TYPE_) ((_TYPE_)1 /resu[item]);
+      }
+  });
+  if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
 }
 }
 #endif
@@ -556,7 +524,7 @@ namespace
 {
 template<typename ExecSpace, typename _TYPE_, typename _SIZE_,typename _TYPE_RETURN_,  TYPE_OPERATION_VECT _TYPE_OP_>
 void local_extrema_vect_generic_kernel(const TRUSTVect<_TYPE_,_SIZE_>& vx, int nblocs_left, Block_Iter<_SIZE_>& bloc_itr,
-                                       const _SIZE_ vect_size_tot, const int line_size, _TYPE_& min_max_val, int& i_min_max)
+                                       const _SIZE_ vect_size_tot, const int line_size, _TYPE_& min_max_val, _SIZE_& i_min_max)
 {
   // Shortcut for empty arrays (avoid case line_size == 0)
   if (bloc_itr.empty()) return ;
@@ -570,53 +538,40 @@ void local_extrema_vect_generic_kernel(const TRUSTVect<_TYPE_,_SIZE_>& vx, int n
   static constexpr bool IS_ABS = (IS_MAX_ABS || IS_MIN_ABS);
 
   // Define the reducer, based on the reduction type
-  using reducer = typename std::conditional<IS_MAXS, Kokkos::MaxLoc<_TYPE_, int>, Kokkos::MinLoc<_TYPE_, int>>::type;
+  using reducer = typename std::conditional<IS_MAXS, Kokkos::MaxLoc<_TYPE_, _SIZE_>, Kokkos::MinLoc<_TYPE_, _SIZE_>>::type;
   // Define the type of what the reducer will return ( a value + a index)
   using reducer_value_type  = typename reducer::value_type;
 
   if (not(IS_MAXS || IS_MINS)) {Process::exit("Wrong operation type in local_extrema_vect_generic_kernel");}
 
   auto vx_view= vx.template view_ro<1, ExecSpace>().data();
-#ifdef TRUST_USE_GPU
-  if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel");
-#endif
-  for (; nblocs_left; nblocs_left--)
-    {
-      // Get index of next bloc start:
-      const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size;
-      const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size;
-
-      //Asserts
-      assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc);
+  const bool has_items = static_cast<bool>(bloc_itr.items_);
+  auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr;
+  const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size;
 
-      //Define Policy
-      Kokkos::RangePolicy<ExecSpace> policy(begin_bloc, end_bloc);
-
-      // Define the object in which the reduction is saved
-      reducer_value_type bloc_min_max;
-
-      //Reduction
-      if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
-      Kokkos::parallel_reduce(policy,
-                              KOKKOS_LAMBDA(const int i, reducer_value_type& local_min_max)
+  reducer_value_type global_min_max;
+  if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, n_items),
+                          KOKKOS_LAMBDA(const _SIZE_ i, reducer_value_type& local_min_max)
+  {
+    for (int j = 0; j < line_size; j++)
       {
-        const _TYPE_ val = (IS_ABS) ? Kokkos::abs(vx_view[i]) : vx_view[i];
-
+        const _SIZE_ item = (has_items ? items[i] : i) * line_size + j;
+        const _TYPE_ val = (IS_ABS) ? Kokkos::abs(vx_view[item]) : vx_view[item];
         if ( (IS_MAXS && val>local_min_max.val) || (IS_MINS && val<local_min_max.val) )
           {
-            local_min_max.val=val;
-            local_min_max.loc=i; // not begin_bloc + i ? This seems to be what was done before, although this is weird to me (dont we want the global index ?)
+            local_min_max.val = val;
+            local_min_max.loc = item;
           }
       }
-      ,reducer(bloc_min_max)); //Reduce in bloc_min_max
-      if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
+  },
+  reducer(global_min_max));
+  if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
 
-      //Bloc-level reduction
-      if ( (IS_MAXS && bloc_min_max.val > min_max_val) || (IS_MINS && bloc_min_max.val < min_max_val) )
-        {
-          min_max_val=bloc_min_max.val;
-          i_min_max= bloc_min_max.loc;
-        }
+  if ( (IS_MAXS && global_min_max.val > min_max_val) || (IS_MINS && global_min_max.val < min_max_val) )
+    {
+      min_max_val = global_min_max.val;
+      i_min_max = global_min_max.loc;
     }
 }
 }
@@ -643,7 +598,7 @@ _TYPE_RETURN_ local_extrema_vect_generic(const TRUSTVect<_TYPE_,_SIZE_>& vx, Mp_
 
   //Initialize results
   _TYPE_ min_max_val = neutral_value<_TYPE_,_TYPE_OP_>(); // _TYPE_ et pas _TYPE_RETURN_ desole ...
-  int i_min_max = -1 ; // seulement pour IMAX_ et IMIN_
+  _SIZE_ i_min_max = -1 ; // seulement pour IMAX_ et IMIN_
 
   //Localize data
   bool kernelOnDevice = vx.checkDataOnDevice();
@@ -695,60 +650,30 @@ template trustIdType local_extrema_vect_generic<trustIdType, trustIdType, trustI
 namespace
 {
 template<typename ExecSpace, typename _TYPE_, typename _SIZE_, TYPE_OPERATION_VECT_BIS _TYPE_OP_>
-void local_operations_vect_bis_generic_kernel(const TRUSTVect<_TYPE_,_SIZE_>& vx, int nblocs_left,
+void local_operations_vect_bis_generic_kernel(const TRUSTVect<_TYPE_,_SIZE_>& tab_vx, int nblocs_left, // NOTE(review): nblocs_left is no longer read in the new body
                                               Block_Iter<_SIZE_>& bloc_itr, const _SIZE_ vect_size_tot, const int line_size, _TYPE_& sum)
 {
   static constexpr bool IS_SQUARE = (_TYPE_OP_ == TYPE_OPERATION_VECT_BIS::SQUARE_), IS_SUM = (_TYPE_OP_ == TYPE_OPERATION_VECT_BIS::SOMME_);
   // Performance important point for TRUSTArray dynamic kernel to have serial mode performance:
   // Use pointer access into Kokkos loop with [] and getting raw pointer to view with .data() !
-  auto vx_view = vx.template view_ro<1, ExecSpace>().data();
-  if (nblocs_left>3)
-    {
-      // We use flattened items_blocs cause possible huge number in parallel of nblocs_left/kernel launch (e.g. during moyenne(Ps))
-      auto items = bloc_itr.items_->template view_ro<1, ExecSpace>().data();
-      // Reduction
-      if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
-      Kokkos::parallel_reduce(__KERNEL_NAME__,
-                              Kokkos::RangePolicy<ExecSpace>(0, bloc_itr.items_->size_array()),
-                              KOKKOS_LAMBDA(const int i, _TYPE_& local_sum)
+  auto vx = tab_vx.template view_ro<1, ExecSpace>().data(); // raw read-only pointer to the vector data
+  const bool has_items = static_cast<bool>(bloc_itr.items_); // true when bloc_itr carries an item list
+  auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr; // item list, or nullptr when there is none
+  const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size; // without a list: every row of the vector
+  if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
+  Kokkos::parallel_reduce(__KERNEL_NAME__,
+                          Kokkos::RangePolicy<ExecSpace>(0, n_items),
+                          KOKKOS_LAMBDA(const _SIZE_ i, _TYPE_& local_sum)
+  {
+    for (int j = 0; j < line_size; j++) // visit the line_size components of row i
       {
-        _SIZE_ item = items[i] * line_size;
-        const _TYPE_ x = vx_view[item];
+        const _SIZE_ item = (has_items ? items[i] : i) * line_size + j; // flat index of component j of the selected row
+        const _TYPE_ x = vx[item];
         if (IS_SQUARE) local_sum += x * x;
-        if (IS_SUM) local_sum += x;
-      },sum);
-      if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
-    }
-  else
-    {
-      for (; nblocs_left; nblocs_left--)
-        {
-          // Get index of next bloc start:
-          const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size;
-          const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size;
-          //Asserts
-          assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc);
-          //Define Policy
-          Kokkos::RangePolicy <ExecSpace> policy(begin_bloc, end_bloc);
-          // Define the bloc sum
-          _TYPE_ bloc_sum = 0;
-          //Reduction
-          if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
-          Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(
-                                    const _SIZE_ i, _TYPE_
-                                    &local_sum)
-          {
-            const _TYPE_ x = vx_view[i];
-            if (IS_SQUARE) local_sum += x * x;
-            if (IS_SUM) local_sum += x;
-          }
-          ,bloc_sum); //Reduce in bloc_sum
-          if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
-
-          //Bloc-level reduction
-          sum += bloc_sum;
-        }
-    }
+        if (IS_SUM)    local_sum += x;
+      }
+  }, sum); // NOTE(review): the reduction result is written to sum; the removed per-bloc path accumulated with += — confirm callers pass sum == 0
+  if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
 }
 }
 #endif
@@ -807,11 +732,11 @@ template double local_operations_vect_bis_generic<double, trustIdType, TYPE_OPER
 namespace
 {
 template<typename ExecSpace, typename _TYPE_, typename _SIZE_>
-void invalidate_data_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu,
+void invalidate_data_kernel(TRUSTVect<_TYPE_,_SIZE_>& tab_resu,
                             const ArrOfInt& items_blocs, const int line_size, const int blocs_size)
 {
   _TYPE_ invalid = (_TYPE_)-987654321;
-  auto resu_view= resu.template view_rw<1, ExecSpace>().data();
+  auto resu = tab_resu.template view_rw<1, ExecSpace>().data();
 
   int i = 0;
   for (int blocs_idx = 0; blocs_idx < blocs_size; blocs_idx += 2) // process data until beginning of next bloc, or end of array
@@ -823,19 +748,19 @@ void invalidate_data_kernel(TRUSTVect<_TYPE_,_SIZE_>& resu,
       if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
       Kokkos::parallel_for(policy,KOKKOS_LAMBDA(const int count)
       {
-        resu_view[count]=invalid;
+        resu[count]=invalid;
       });
       if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
       i = items_blocs[blocs_idx+1] * line_size;
     }
-  const _SIZE_ bloc_end = resu.size_array(); // Process until end of vector
+  const _SIZE_ bloc_end = tab_resu.size_array(); // Process until end of vector
   //Define Policy
   Kokkos::RangePolicy<ExecSpace> policy(i, bloc_end);
   //Loop
   if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
   Kokkos::parallel_for(policy,KOKKOS_LAMBDA(const int count)
   {
-    resu_view[count]=invalid;
+    resu[count]=invalid;
   });
   if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
 }
@@ -878,43 +803,26 @@ template void invalidate_data<int>(TRUSTVect<int, int>& resu, Mp_vect_options op
 namespace
 {
 template<typename ExecSpace, typename _TYPE_, typename _SIZE_>
-void local_prodscal_kernel(const TRUSTVect<_TYPE_,_SIZE_>& vx, const TRUSTVect<_TYPE_,_SIZE_>& vy, int nblocs_left,
+void local_prodscal_kernel(const TRUSTVect<_TYPE_,_SIZE_>& tab_vx, const TRUSTVect<_TYPE_,_SIZE_>& tab_vy, int nblocs_left, // NOTE(review): nblocs_left is no longer read in the new body
                            Block_Iter<_SIZE_>& bloc_itr, const int vect_size_tot, const int line_size, _TYPE_& sum)
 {
-  auto vx_view= vx.template view_ro<1, ExecSpace>().data();
-  auto vy_view= vy.template view_ro<1, ExecSpace>().data();
-#ifdef TRUST_USE_GPU
-  if (nblocs_left>3) ToDo_Kokkos("nblocs_left too high, optimize by rewriting as local_operations_vect_bis_generic_kernel");
-#endif
-  for (; nblocs_left; nblocs_left--)
-    {
-      // Get index of next bloc start:
-      const _SIZE_ begin_bloc = (*(bloc_itr++)) * line_size;
-      const _SIZE_ end_bloc = (*(bloc_itr++)) * line_size;
-
-      //Asserts
-      assert(begin_bloc >= 0 && end_bloc <= vect_size_tot && end_bloc >= begin_bloc);
-
-      //Define Policy
-      Kokkos::RangePolicy<ExecSpace> policy(begin_bloc, end_bloc);
-
-      // Define the bloc sum
-      _TYPE_ bloc_sum=0;
-
-      //Reduction
-      if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
-      Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const _SIZE_ i, _TYPE_& local_sum)
+  auto vx = tab_vx.template view_ro<1, ExecSpace>().data(); // raw read-only pointer to the first operand
+  auto vy = tab_vy.template view_ro<1, ExecSpace>().data(); // raw read-only pointer to the second operand
+  const bool has_items = static_cast<bool>(bloc_itr.items_); // true when bloc_itr carries an item list
+  auto items = has_items ? bloc_itr.items_->template view_ro<1, ExecSpace>().data() : nullptr; // item list, or nullptr when there is none
+  const _SIZE_ n_items = has_items ? bloc_itr.items_->size_array() : vect_size_tot / line_size; // without a list: every row of the vector
+  if (statistics().get_use_gpu()) start_gpu_timer(__KERNEL_NAME__);
+  Kokkos::parallel_reduce(__KERNEL_NAME__,
+                          Kokkos::RangePolicy<ExecSpace>(0, n_items),
+                          KOKKOS_LAMBDA(const _SIZE_ i, _TYPE_& local_sum)
+  {
+    for (int j = 0; j < line_size; j++) // visit the line_size components of row i
       {
-        local_sum += vx_view[i]*vy_view[i];
+        const _SIZE_ item = (has_items ? items[i] : i) * line_size + j; // flat index of component j of the selected row
+        local_sum += vx[item]*vy[item];
       }
-      , Kokkos::Sum<_TYPE_>(bloc_sum)); //Reduce in bloc_sum
-
-      //timer
-      if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
-
-      //Bloc-level reduction
-      sum += bloc_sum;
-    }
+  }, sum); // NOTE(review): the reduction result is written to sum; the removed code accumulated with += — confirm callers pass sum == 0
+  if (statistics().get_use_gpu()) end_gpu_timer(__KERNEL_NAME__, is_default_exec_space<ExecSpace>);
 }
 }
 #endif
diff --git a/src/Kernel/Operateurs/Operateur_Grad_base.cpp b/src/Kernel/Operateurs/Operateur_Grad_base.cpp
index 228bd6f6fd..a769754d1f 100644
--- a/src/Kernel/Operateurs/Operateur_Grad_base.cpp
+++ b/src/Kernel/Operateurs/Operateur_Grad_base.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -50,7 +50,14 @@ void Operateur_Grad_base::dimensionner(Matrice_Morse& mat) const
 DoubleTab& Operateur_Grad_base::ajouter(const DoubleTab& inco, DoubleTab& secmem) const
 {
   if (has_interface_blocs())
-    secmem *= -1, ajouter_blocs({}, secmem, {{ "pression", inco }}), secmem *= -1; /* pour avoir le bon signe */
+    {
+      secmem *= -1; /* first sign flip; undone after ajouter_blocs */
+      tabs_t semi_impl;
+      // previous form, kept for reference: ajouter_blocs({}, secmem, {{"pression", inco}});
+      semi_impl["pression"].ref(inco); /* avoids copying inco into tabs_t */
+      ajouter_blocs({}, secmem, semi_impl);
+      secmem *= -1; /* to get the right sign */
+    }
   else Process::exit(que_suis_je() + " : ajouter() not coded!");
   return secmem;
 }
diff --git a/src/Kernel/Postraitement/MED/Ecrire_MED.cpp b/src/Kernel/Postraitement/MED/Ecrire_MED.cpp
index 1044f73c57..2a117c7d5f 100644
--- a/src/Kernel/Postraitement/MED/Ecrire_MED.cpp
+++ b/src/Kernel/Postraitement/MED/Ecrire_MED.cpp
@@ -38,6 +38,7 @@ using namespace MEDCoupling;
 
 Implemente_instanciable_32_64(Ecrire_MED_32_64,"Write_MED",Interprete);
 Add_synonym(Ecrire_MED,"Ecrire_MED");
+Add_synonym(Ecrire_MED_64, "Ecrire_MED_64");
 
 // Anonymous namespace for local functions:
 namespace
@@ -417,14 +418,6 @@ void Ecrire_MED_32_64<_SIZE_>::ecrire_domaine_dis(bool append)
 #endif
 }
 
-#if INT_is_64_ == 2
-template <>
-void Ecrire_MED_32_64<trustIdType>::ecrire_domaine_dis(bool append)
-{
-  Process::exit("Ecrire_MED_32_64<trustIdType>::ecrire_domaine_dis() -- Not allowed with a 64b object!");
-}
-#endif
-
 /*! @brief Permet d'ecrire le tableau de valeurs val comme un champ dans le fichier med de nom nom_fichier_, avec pour support le domaine de nom nom_dom.
  *
  *   @param type: CHAMPPOINT,CHAMPMAILLE,CHAMPFACES
diff --git a/src/Kernel/Postraitement/MED/TRUST_2_MED.cpp b/src/Kernel/Postraitement/MED/TRUST_2_MED.cpp
index c03a794cf7..4e776792d3 100644
--- a/src/Kernel/Postraitement/MED/TRUST_2_MED.cpp
+++ b/src/Kernel/Postraitement/MED/TRUST_2_MED.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -285,7 +285,7 @@ INTERP_KERNEL::NormalizedCellType type_geo_trio_to_type_medcoupling(const Nom& t
       type_cell = INTERP_KERNEL::NORM_QUAD4;
       mesh_dimension = 2;
     }
-  else if  ((type_elem=="HEXAEDRE") || (type_elem=="HEXAEDRE_VEF"))
+  else if  ((type_elem.debute_par("HEXAEDRE")))
     {
       type_cell = INTERP_KERNEL::NORM_HEXA8;
       mesh_dimension = 3;
@@ -295,7 +295,7 @@ INTERP_KERNEL::NormalizedCellType type_geo_trio_to_type_medcoupling(const Nom& t
       type_cell = INTERP_KERNEL::NORM_TRI3;
       mesh_dimension = 2;
     }
-  else if  (type_elem=="TETRAEDRE")
+  else if ((type_elem.debute_par("TETRAEDRE")))
     {
       type_cell = INTERP_KERNEL::NORM_TETRA4;
       mesh_dimension = 3;
diff --git a/src/Kernel/Statistiques_temps/Integrale_tps_produit_champs.cpp b/src/Kernel/Statistiques_temps/Integrale_tps_produit_champs.cpp
index 04e4bdb357..122b35ec88 100644
--- a/src/Kernel/Statistiques_temps/Integrale_tps_produit_champs.cpp
+++ b/src/Kernel/Statistiques_temps/Integrale_tps_produit_champs.cpp
@@ -16,6 +16,7 @@
 #include <Integrale_tps_produit_champs.h>
 #include <Domaine_VF.h>
 #include <TRUSTTab.h>
+#include <TRUSTTrav.h>
 
 Implemente_instanciable(Integrale_tps_produit_champs, "Integrale_tps_produit_champs", Integrale_tps_Champ);
 
@@ -46,6 +47,16 @@ void Integrale_tps_produit_champs::mettre_a_jour_integrale()
 
   if (t_courant != mon_second_champ()->get_time())
     {
+      // Lors d'une reprise, certains champs (ex. gradient_vitesse) repartent a t=0
+      // tandis que d'autres (ex. pression) sont relus depuis la sauvegarde.
+      // Si tps_integrale_ est deja avance (reprise), on ignore cette incoherence
+      // au premier pas et on synchronise tps_integrale_ sur le temps courant du
+      // premier champ pour repartir proprement.
+      if (tps_integrale_ > 0. && mon_second_champ()->get_time() == 0.)
+        {
+          tps_integrale_ = t_courant;
+          return;
+        }
       Cerr << "Integrale_tps_produit_champs::mettre_a_jour_integrale()" << finl;
       Cerr << "the current time of the field named " << nom[0] << " =" << t_courant << finl;
       Cerr << "is different of the second field current time " << nom2[0] << " =" << source2.temps() << finl;
@@ -85,17 +96,13 @@ void Integrale_tps_produit_champs::ajoute_produit_tensoriel(double alpha, const
 {
   if (support_different_)
     {
-      ToDo_Kokkos("Use DoubleTrav and don't resize...");
       // On ramene au centre des elements
+      const Domaine& dom = le_champ_->domaine_dis_base().domaine();
       const DoubleTab& xp = ref_cast(Domaine_VF,le_champ_->domaine_dis_base()).xp();
       int nb_elem_tot = xp.dimension_tot(0);
-      DoubleTab val_a, val_b;
-      // Le jour ou les champs seront mieux foutus, on n'aura
-      // pas a faire ca:
-      val_a.resize(nb_elem_tot, a.nb_comp());
-      val_b.resize(nb_elem_tot, b.nb_comp());
-      a.valeur_aux(xp, val_a);
-      b.valeur_aux(xp, val_b);
+      DoubleTrav val_a(nb_elem_tot, a.nb_comp()), val_b(nb_elem_tot, b.nb_comp());
+      a.valeur_aux_centres_de_gravite(dom, val_a);
+      b.valeur_aux_centres_de_gravite(dom, val_b);
       le_champ_->valeurs().ajoute_produit_tensoriel(alpha, val_a, val_b);
       le_champ_->valeurs().echange_espace_virtuel();
     }
diff --git a/src/Kernel/Statistiques_temps/Moyenne_volumique.cpp b/src/Kernel/Statistiques_temps/Moyenne_volumique.cpp
index ddd02105b7..8679f61329 100644
--- a/src/Kernel/Statistiques_temps/Moyenne_volumique.cpp
+++ b/src/Kernel/Statistiques_temps/Moyenne_volumique.cpp
@@ -258,7 +258,7 @@ int Moyenne_volumique::get_champ(const Nom& nom_pb,
               if (tmp == mc_nom_champ)
                 {
                   Operateur_Statistique_tps_base& stat = stats[i_stat].valeur();
-                  ref_cast_non_const(DoubleTab, stat.integrale().le_champ_calcule().valeurs()) = stat.calculer_valeurs();
+                  stat.calculer(ref_cast_non_const(DoubleTab, stat.integrale().le_champ_calcule().valeurs()));
                   ref_champ = stat.integrale().le_champ_calcule();
                   return 1;
                 }
diff --git a/src/Kernel/Statistiques_temps/Op_Correlation.cpp b/src/Kernel/Statistiques_temps/Op_Correlation.cpp
index 70c923076f..aab2c7d213 100644
--- a/src/Kernel/Statistiques_temps/Op_Correlation.cpp
+++ b/src/Kernel/Statistiques_temps/Op_Correlation.cpp
@@ -186,7 +186,7 @@ void Op_Correlation::completer(const Probleme_base& Pb, const Nom& prefix)
   integrale_tps_ab_.le_champ_calcule().changer_temps(Pb.schema_temps().temps_courant());
 }
 
-DoubleTab Op_Correlation::calculer_valeurs() const
+void Op_Correlation::calculer(DoubleTab& tab_correlation) const
 {
   Integrale_tps_produit_champs correlation(integrale_tps_ab_);
   const double dt_ab = dt_integration_ab();
@@ -200,7 +200,7 @@ DoubleTab Op_Correlation::calculer_valeurs() const
       assert(est_egal(dt_b, dt_ab));
       correlation.ajoute_produit_tensoriel(-1 / (dt_a * dt_b), integrale_tps_a_->le_champ_calcule(), integrale_tps_b_->le_champ_calcule());
     }
-  return correlation.le_champ_calcule().valeurs();
+  tab_correlation = correlation.le_champ_calcule().valeurs();
 }
 
 int Op_Correlation::completer_post_statistiques(const Domaine& dom, const int is_axi, Format_Post_base& format)
diff --git a/src/Kernel/Statistiques_temps/Op_Correlation.h b/src/Kernel/Statistiques_temps/Op_Correlation.h
index 686b02dfce..fa748407be 100644
--- a/src/Kernel/Statistiques_temps/Op_Correlation.h
+++ b/src/Kernel/Statistiques_temps/Op_Correlation.h
@@ -51,7 +51,7 @@ class Op_Correlation : public Operateur_Statistique_tps_base
   inline int reprendre(Entree& is) override;
   inline void associer_op_stat(const Operateur_Statistique_tps_base&) override;
   void completer(const Probleme_base&, const Nom&) override;
-  DoubleTab calculer_valeurs() const override;
+  void calculer(DoubleTab&) const override;
 
 protected:
   OBS_PTR(Op_Moyenne) la_moyenne_a_;
diff --git a/src/Kernel/Statistiques_temps/Op_Correlation_Triple.cpp b/src/Kernel/Statistiques_temps/Op_Correlation_Triple.cpp
new file mode 100644
index 0000000000..8266e88517
--- /dev/null
+++ b/src/Kernel/Statistiques_temps/Op_Correlation_Triple.cpp
@@ -0,0 +1,415 @@
+/****************************************************************************
+* Copyright (c) 2026, CEA
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*****************************************************************************/
+
+#include <Champ_Generique_refChamp.h>
+#include <Discretisation_base.h>
+#include <Schema_Temps_base.h>
+#include <Op_Correlation_Triple.h>
+#include <Domaine_VF.h>
+#include <TRUSTTrav.h>
+#include <algorithm>
+
+Implemente_instanciable(Op_Correlation_Triple, "Op_Correlation_Triple", Operateur_Statistique_tps_base);
+
+Sortie& Op_Correlation_Triple::printOn(Sortie& s) const { return s << que_suis_je() << " " << le_nom(); } // writes que_suis_je(), a space, then le_nom() to s
+Entree& Op_Correlation_Triple::readOn(Entree& s) { return s; } // reads nothing: the stream is returned untouched
+
+// ---------------------------------------------------------------------------
+// get_dt: efficient read of dt_integration_ without bringing the whole array back (returns 0 for an empty tab).
+// ---------------------------------------------------------------------------
+double get_dt(const DoubleTab& tab, int off_dt) // off_dt: column read when tab is not 1D; unused for a 1D tab
+{
+  if (tab.size_array() == 0) return 0.; // empty array: nothing to read
+  // Rank-0 host view, so that only one value is retrieved:
+  Kokkos::View<double, Kokkos::HostSpace> dt("dt_integration");
+  if (tab.nb_dim() == 1)
+    {
+      CDoubleArrView val = static_cast<const ArrOfDouble&>(tab).view_ro();
+      Kokkos::deep_copy(dt, Kokkos::subview(val, 0)); // copies entry 0
+    }
+  else
+    {
+      CDoubleTabView val = tab.view_ro();
+      Kokkos::deep_copy(dt, Kokkos::subview(val, 0, off_dt)); // copies entry (0, off_dt)
+    }
+  return dt();
+}
+
+// ---------------------------------------------------------------------------
+// associer: stores the three source fields and the [t1, t2] window, and types the integral field from field a.
+// ---------------------------------------------------------------------------
+void Op_Correlation_Triple::associer(const Domaine_dis_base& une_zdis,
+                                     const Champ_Generique_base& le_champ_a,
+                                     const Champ_Generique_base& le_champ_b,
+                                     const Champ_Generique_base& le_champ_c,
+                                     double t1, double t2)
+{
+  le_champ_a_ = le_champ_a;
+  le_champ_b_ = le_champ_b;
+  le_champ_c_ = le_champ_c;
+
+  OWN_PTR(Champ_base) es;
+  const Champ_base& source = le_champ_a.get_champ(es); // only field a is used to choose the type below
+  Nom type = source.que_suis_je();
+  int renomme = 0;
+  if (type.debute_par("Champ")) renomme = 1; // type name starts with "Champ": rebuild it as "Champ_Fonc_" + remainder
+  type.suffix("Champ_"); // presumably strips a leading "Champ_" — TODO confirm Nom::suffix semantics
+  type.suffix("Fonc_"); // presumably strips a leading "Fonc_" — TODO confirm Nom::suffix semantics
+  Nom type_final("Champ_Fonc_");
+  if (renomme) type_final += type;
+  else type_final = type; // otherwise use the (suffix-processed) name as is
+
+  integrale_tps_abc_.typer_champ(type_final);
+  integrale_tps_abc_.le_champ_calcule().associer_domaine_dis_base(une_zdis);
+
+  t_deb_ = t1;
+  t_fin_ = t2;
+  tps_courant_ = t1; // the current time starts at t1
+  integrale_tps_abc_.fixer_t_debut(t1);
+  integrale_tps_abc_.fixer_t_fin(t2);
+  integrale_tps_abc_.fixer_tps_integrale(t1);
+  integrale_tps_abc_.fixer_dt_integr(0.); // integration duration starts at zero
+}
+
+// ---------------------------------------------------------------------------
+// completer
+//
+// The computed field of integrale_tps_abc_ is sized with nb_comp_tot_
+// columns instead of nb_comp_abc_, so that moy_a_/b_/c_ and
+// dt_integration_ are also encoded in the same DoubleTab.
+//
+// Column layout:
+//   [0 .. nb_comp_abc_-1]                 : triple integral
+//   [off_moy_a_ .. off_moy_a_+nca-1]      : moy_a_ = int(F dt)
+//   [off_moy_b_ .. off_moy_b_+ncb-1]      : moy_b_ = int(G dt)
+//   [off_moy_c_ .. off_moy_c_+ncc-1]      : moy_c_ = int(H dt)
+//   [off_dt_]                             : dt_integration_ (same value on every element)
+//
+// For post-processing, get_champ() extracts the first nb_comp_abc_ columns
+// via calculer_valeurs(). The Champ_Generique override of integrale().le_champ_calcule().nb_comp()
+// returns nb_comp_abc_ (see get_property). NOTE(review): those functions are defined elsewhere — confirm.
+// ---------------------------------------------------------------------------
+void Op_Correlation_Triple::completer(const Probleme_base& Pb, const Nom& prefix)
+{
+  const OBS_PTR(Champ_Generique_base)& a = le_champ_a_;
+  const OBS_PTR(Champ_Generique_base)& b = le_champ_b_;
+  const OBS_PTR(Champ_Generique_base)& c = le_champ_c_;
+
+  const Domaine_dis_base& domaine = a->get_ref_domaine_dis_base(); // the domain is taken from field a
+
+  OWN_PTR(Champ_base) es_a, es_b, es_c;
+  const Champ_base& source_a = a->get_champ(es_a);
+  const Champ_base& source_b = b->get_champ(es_b);
+  const Champ_base& source_c = c->get_champ(es_c);
+
+  const int nca = source_a.nb_comp();
+  const int ncb = source_b.nb_comp();
+  const int ncc = source_c.nb_comp();
+  nb_comp_a_ = nca;
+  nb_comp_b_ = ncb;
+  nb_comp_c_ = ncc;
+  nb_comp_abc_ = nca * ncb * ncc; // one column per (a, b, c) component triple
+
+  off_moy_a_   = nb_comp_abc_;
+  off_moy_b_   = nb_comp_abc_ + nca;
+  off_moy_c_   = nb_comp_abc_ + nca + ncb;
+  off_dt_      = nb_comp_abc_ + nca + ncb + ncc;
+  nb_comp_tot_ = nb_comp_abc_ + nca + ncb + ncc + 1; // + 1 for the dt column
+
+  bool ref_abc = sub_type(Champ_Generique_refChamp, a.valeur()) // true only when all three fields are Champ_Generique_refChamp
+                 && sub_type(Champ_Generique_refChamp, b.valeur())
+                 && sub_type(Champ_Generique_refChamp, c.valeur());
+
+  Noms noms_a, noms_b, noms_c, compo_a, compo_b, compo_c;
+  if (!ref_abc)
+    {
+      noms_a  = a->get_property("nom");
+      noms_b  = b->get_property("nom");
+      noms_c  = c->get_property("nom");
+      compo_a = a->get_property("composantes");
+      compo_b = b->get_property("composantes");
+      compo_c = c->get_property("composantes");
+    }
+  else
+    {
+      noms_a  = a->get_property("nom_cible");
+      noms_b  = b->get_property("nom_cible");
+      noms_c  = c->get_property("nom_cible");
+      compo_a = source_a.noms_compo();
+      compo_b = source_b.noms_compo();
+      compo_c = source_c.noms_compo();
+    }
+  const Nom nom_a = noms_a[0], nom_b = noms_b[0], nom_c = noms_c[0];
+  const Noms unites_a = a->get_property("unites");
+  const Noms unites_b = b->get_property("unites");
+  const Noms unites_c = c->get_property("unites");
+
+  Nom type_P0 = "Champ_Fonc_P0_";
+  type_P0 += Pb.discretisation().que_suis_je().substr_old(1, 3); // appends part of the discretisation name — TODO confirm substr_old indexing
+  const int nb_val = domaine.nb_elem();
+
+  // Component names for the extended array (nb_comp_tot_ columns).
+  // The first nb_comp_abc_ columns carry the real names (for post-processing).
+  // The auxiliary columns carry internal names (not post-processed).
+  Noms noms_comp(nb_comp_tot_);
+  {
+    Nom debut("Correlation_Triple_");
+    for (int i = 0; i < nca; i++)
+      for (int j = 0; j < ncb; j++)
+        for (int k = 0; k < ncc; k++)
+          noms_comp[(i * ncb + j) * ncc + k] = // (i, j, k) -> column index, k varying fastest
+            debut + compo_a[i] + "_" + compo_b[j] + "_" + compo_c[k];
+    // Auxiliary columns: internal names
+    for (int i = 0; i < nca; i++)
+      noms_comp[off_moy_a_ + i] = Nom("_moy_a_") + compo_a[i];
+    for (int i = 0; i < ncb; i++)
+      noms_comp[off_moy_b_ + i] = Nom("_moy_b_") + compo_b[i];
+    for (int i = 0; i < ncc; i++)
+      noms_comp[off_moy_c_ + i] = Nom("_moy_c_") + compo_c[i];
+    noms_comp[off_dt_] = "_dt_integration_";
+  }
+
+  Nom nom_post("Correlation_Triple_");
+  nom_post += nom_a + "_" + nom_b + "_" + nom_c;
+
+  Nom unite(unites_a[0]); // unit string built as "<unit a>.<unit b>.<unit c>" from the first unit of each field
+  unite += ".";
+  unite += unites_b[0];
+  unite += ".";
+  unite += unites_c[0];
+  Noms unites_tot(nb_comp_tot_);
+  for (int i = 0; i < nb_comp_tot_; i++) unites_tot[i] = unite; // same unit string on every column, auxiliary ones included
+
+  integrale_tps_abc_.support_different() = 1;
+  integrale_tps_abc_.typer_champ(type_P0);
+  integrale_tps_abc_.le_champ_calcule().associer_domaine_dis_base(domaine);
+  integrale_tps_abc_.le_champ_calcule().fixer_nb_comp(nb_comp_tot_);
+  valeurs_etendues().resize(0, nb_comp_tot_);
+  integrale_tps_abc_.le_champ_calcule().fixer_nb_valeurs_nodales(nb_val);
+  valeurs_etendues() = 0.; // every column starts at zero
+
+  integrale_tps_abc_.le_champ_calcule().nommer(nom_post);
+  integrale_tps_abc_.le_champ_calcule().set_pdi_name(prefix + nom_post);
+  integrale_tps_abc_.le_champ_calcule().fixer_noms_compo(noms_comp);
+  integrale_tps_abc_.le_champ_calcule().fixer_unites(unites_tot);
+  integrale_tps_abc_.le_champ_calcule().changer_temps(Pb.schema_temps().temps_courant());
+}
+
+void set_dt(DoubleTab& tab_dt, int off_dt, double val) // writes val as the stored dt: entry 0 of a 1D tab, else column off_dt of every row
+{
+  if (tab_dt.size_array() == 0) return; // empty array: nothing to write
+  const int n = tab_dt.dimension_tot(0);
+  if (tab_dt.nb_dim() == 1) { tab_dt(0) = val; } // 1D case: only entry 0 is written
+  else
+    {
+      DoubleTabView tab = tab_dt.view_rw();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, n), KOKKOS_LAMBDA(const int i)
+      {
+        tab(i, off_dt) = val; // same value on every row
+      });
+      end_gpu_timer(__KERNEL_NAME__);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// mettre_a_jour: accumulates the running means and the triple-product integral over the step ending at tps.
+// ---------------------------------------------------------------------------
+void Op_Correlation_Triple::mettre_a_jour(double tps)
+{
+  if (tps < t_deb_ || tps > t_fin_) return; // outside the [t_deb_, t_fin_] window: nothing is done
+  const double dt = tps - tps_courant_;
+  tps_courant_ = tps; // recorded even when dt <= 0
+  if (dt <= 0.) return; // no elapsed time: nothing to accumulate
+
+  const Domaine_dis_base& zdis = le_champ_a_->get_ref_domaine_dis_base();
+  const int n_elem = zdis.domaine().nb_elem_tot();
+
+  // Interpolation to the centres of gravity of the elements.
+  // Original note: follows the pattern of Integrale_tps_produit_champs::ajoute_produit_tensoriel (NOTE(review): that function now uses DoubleTrav + valeur_aux_centres_de_gravite):
+  //   val.resize(nb_elem_tot, nb_comp) WITHOUT md_vector, then valeur_aux(xp, val).
+  // This avoids propagating any md_vector (faces in VEF) into val,
+  // which would trigger the md_vector_ == v.md_vector_ assertion in debug mode.
+  const DoubleTab& xp = ref_cast(Domaine_VF, zdis).xp();
+  auto aux_elems = [&](const OBS_PTR(Champ_Generique_base)& ch, int nb_comp, DoubleTab& tab_val) // fills tab_val with the values of ch, one row per element
+  {
+    OWN_PTR(Champ_base) es;
+    const Champ_base& src = ch->get_champ(es);
+    const DoubleTab& tab_raw = src.valeurs();
+    if (tab_raw.dimension_tot(0) != n_elem)
+      {
+        // Row count differs from n_elem (e.g. field at faces): interpolate to the elements via valeur_aux(xp, tab_val).
+        // tab_val is sized WITHOUT md_vector so that it does not inherit the one of the faces.
+        if (nb_comp > 1) tab_val.resize(n_elem, nb_comp);
+        else tab_val.resize(n_elem);
+        src.valeur_aux(xp, tab_val);
+      }
+    else
+      {
+        // Field already at the elements: plain copy.
+        // The values are copied into a local array without md_vector
+        // to stay consistent with the interpolated case.
+        if (nb_comp > 1) tab_val.resize(n_elem, nb_comp, RESIZE_OPTIONS::NOCOPY_NOINIT);
+        else tab_val.resize(n_elem, RESIZE_OPTIONS::NOCOPY_NOINIT);
+        const int nloc = std::min(n_elem, (int)tab_raw.dimension_tot(0));
+        if (nb_comp == 1)
+          {
+            CDoubleArrView raw = static_cast<const ArrOfDouble&>(tab_raw).view_ro();
+            DoubleArrView val = static_cast<ArrOfDouble&>(tab_val).view_rw();
+            Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nloc), KOKKOS_LAMBDA(const int i)
+            {
+              val(i) = raw(i);
+            });
+            end_gpu_timer(__KERNEL_NAME__);
+          }
+        else
+          {
+            CDoubleTabView raw = tab_raw.view_ro();
+            DoubleTabView val = tab_val.view_rw();
+            Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {nloc, nb_comp}), KOKKOS_LAMBDA(const int i, const int k)
+            {
+              val(i, k) = raw(i, k);
+            });
+            end_gpu_timer(__KERNEL_NAME__);
+          }
+      }
+  };
+
+  DoubleTrav tab_val_a, tab_val_b, tab_val_c;
+  aux_elems(le_champ_a_, nb_comp_a_, tab_val_a);
+  aux_elems(le_champ_b_, nb_comp_b_, tab_val_b);
+  aux_elems(le_champ_c_, nb_comp_c_, tab_val_c);
+
+  DoubleTab& tab_ext = valeurs_etendues();
+  const double dt_old = get_dt(tab_ext, off_dt_); // integration duration accumulated so far, read back from the dt column
+  const double dt_new = dt_old + dt;
+
+  const int n = std::min((int)tab_ext.dimension_tot(0), n_elem); // rows processed: the smaller of the two row counts
+
+  CDoubleArrView val_a = static_cast<const ArrOfDouble&>(tab_val_a).view_ro(); // flat 1D views: component k of row i is read at i * nb_comp + k
+  CDoubleArrView val_b = static_cast<const ArrOfDouble&>(tab_val_b).view_ro();
+  CDoubleArrView val_c = static_cast<const ArrOfDouble&>(tab_val_c).view_ro();
+  DoubleTabView ext = tab_ext.view_rw();
+
+  const int nb_comp_a = nb_comp_a_, nb_comp_b = nb_comp_b_, nb_comp_c = nb_comp_c_; // local copies of the members used inside the kernels
+  const int off_moy_a = off_moy_a_, off_moy_b = off_moy_b_, off_moy_c = off_moy_c_;
+
+  // Step 1: moy_X += X*dt  (auxiliary columns of the extended array)
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {n, nb_comp_a}), KOKKOS_LAMBDA(const int i, const int ia)
+  {
+    ext(i, off_moy_a + ia) += (nb_comp_a == 1 ? val_a(i) : val_a(i * nb_comp_a + ia)) * dt;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {n, nb_comp_b}), KOKKOS_LAMBDA(const int i, const int ib)
+  {
+    ext(i, off_moy_b + ib) += (nb_comp_b == 1 ? val_b(i) : val_b(i * nb_comp_b + ib)) * dt;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {n, nb_comp_c}), KOKKOS_LAMBDA(const int i, const int ic)
+  {
+    ext(i, off_moy_c + ic) += (nb_comp_c == 1 ? val_c(i) : val_c(i * nb_comp_c + ic)) * dt;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+
+  // Step 2: integral += (F-<F>)*(G-<G>)*(H-<H>)*dt, with <X> = moy_X / dt_new (moy_X already updated by step 1)
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {n, nb_comp_a}), KOKKOS_LAMBDA(const int i, const int ia)
+  {
+    for (int ib = 0; ib < nb_comp_b; ib++)
+      for (int ic = 0; ic < nb_comp_c; ic++)
+        {
+          const int idx = (ia * nb_comp_b + ib) * nb_comp_c + ic; // column of the (ia, ib, ic) triple
+          const double fa = (nb_comp_a == 1 ? val_a(i) : val_a(i * nb_comp_a + ia)) - ext(i, off_moy_a + ia) / dt_new;
+          const double fb = (nb_comp_b == 1 ? val_b(i) : val_b(i * nb_comp_b + ib)) - ext(i, off_moy_b + ib) / dt_new;
+          const double fc = (nb_comp_c == 1 ? val_c(i) : val_c(i * nb_comp_c + ic)) - ext(i, off_moy_c + ic) / dt_new;
+          ext(i, idx) += fa * fb * fc * dt;
+        }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+
+  // Step 3: update dt_integration_ (same value everywhere)
+  set_dt(tab_ext, off_dt_, dt_new);
+
+  integrale_tps_abc_.fixer_dt_integr(dt_new);
+  integrale_tps_abc_.le_champ_calcule().changer_temps(tps);
+}
+
+// ---------------------------------------------------------------------------
+// fill_result : writes the normalised triple correlation into a caller-owned
+// array.
+//
+// The extended array returned by valeurs_etendues() holds more columns than
+// the quantity to export: only its first nb_comp_abc_ columns contain the
+// integral of the triple correlation, the remaining ones being auxiliary data
+// (among them the integration time read through get_dt(..., off_dt_)).
+//
+// fill_result() therefore does not return a copy of the extended array.
+// It fills the array handed over by the caller, which is expected to be
+// already sized (nb_comp_abc_ components) and to already carry its own
+// md_vector. Writing in place, without building and assigning a temporary
+// array, leaves the structure of the destination untouched and avoids the
+// md_vector_ == v.md_vector_ assertion that such an assignment could trigger
+// in debug mode.
+// calculer() below is the entry point for destinations without md_vector.
+// ---------------------------------------------------------------------------
+void Op_Correlation_Triple::fill_result(DoubleTab& tab) const
+{
+  // Stores in tab the first nb_comp_abc_ columns of the extended array divided
+  // by the integration time kept in that array, or zero when that time is not
+  // strictly positive. tab is written in place (assumed already sized by the
+  // caller); only the rows present in both arrays are visited.
+  const DoubleTab& extended = valeurs_etendues();
+  const double t_integ = get_dt(extended, off_dt_);
+  if (!(t_integ > 0.))
+    {
+      tab = 0.;
+      return;
+    }
+
+  const int nb_rows = std::min((int)extended.dimension_tot(0), (int)tab.dimension_tot(0));
+  CDoubleTabView src = extended.view_ro();
+  if (nb_comp_abc_ != 1)
+    {
+      const int nb_cols = nb_comp_abc_;
+      DoubleTabView dst = tab.view_rw();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {nb_rows, nb_cols}), KOKKOS_LAMBDA(const int row, const int col)
+      {
+        dst(row, col) = src(row, col) / t_integ;
+      });
+      end_gpu_timer(__KERNEL_NAME__);
+      return;
+    }
+
+  // Single component: the destination is addressed as a flat array.
+  DoubleArrView flat_dst = static_cast<ArrOfDouble&>(tab).view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_rows), KOKKOS_LAMBDA(const int row)
+  {
+    flat_dst(row) = src(row, 0) / t_integ;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+}
+
+void Op_Correlation_Triple::calculer(DoubleTab& tab) const
+{
+  // Entry point for callers handing over a plain array (presumably without
+  // md_vector): tab is resized with NOCOPY_NOINIT, then filled by fill_result.
+  const int nb_rows = valeurs_etendues().dimension_tot(0);
+  if (nb_comp_abc_ != 1) tab.resize(nb_rows, nb_comp_abc_, RESIZE_OPTIONS::NOCOPY_NOINIT);
+  else tab.resize(nb_rows, RESIZE_OPTIONS::NOCOPY_NOINIT);
+  fill_result(tab);
+}
+
+int Op_Correlation_Triple::completer_post_statistiques(const Domaine&, const int, Format_Post_base&)
+{
+  return 1; // All three arguments are ignored; 1 is returned unconditionally.
+}
diff --git a/src/Kernel/Statistiques_temps/Op_Correlation_Triple.h b/src/Kernel/Statistiques_temps/Op_Correlation_Triple.h
new file mode 100644
index 0000000000..f972745fc6
--- /dev/null
+++ b/src/Kernel/Statistiques_temps/Op_Correlation_Triple.h
@@ -0,0 +1,189 @@
+/****************************************************************************
+* Copyright (c) 2026, CEA
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*****************************************************************************/
+
+#ifndef Op_Correlation_Triple_included
+#define Op_Correlation_Triple_included
+
+#include <Integrale_tps_produit_champs.h>
+#include <TRUSTTabs_forward.h>
+#include <Op_Moyenne.h>
+#include <TRUST_Ref.h>
+
+/*! @brief class Op_Correlation_Triple
+ *
+ *  Computes (1/T) * int_0^T (F-<F>)*(G-<G>)*(H-<H>) dt
+ *  where <X>(t) = (1/t) * int_0^t X dt' is the running average.
+ *
+ *  Algorithm: incremental on-the-fly accumulation.
+ *  At each step dt:
+ *    1. moy_a += F*dt,  moy_b += G*dt,  moy_c += H*dt
+ *    2. integral += (F - moy_a/T_new)*(G - moy_b/T_new)*(H - moy_c/T_new)*dt
+ *  with T_new = dt_integration + dt.
+ *
+ *  === Save / restart strategy ===
+ *
+ *  The TRUST framework requires an operator to save/restore ONE SINGLE
+ *  field block: the framework writes/reads [ident][type] and the operator
+ *  writes/reads [time][data].
+ *  Solution: the WHOLE state is encoded in a SINGLE extended DoubleTab
+ *  (see valeurs_etendues()) of size (n_elem, nb_comp + nb_comp_a + nb_comp_b + nb_comp_c + 1):
+ *    columns [0 .. nb_comp-1]                             : triple integral
+ *    columns [nb_comp .. nb_comp+nca-1]                   : moy_a
+ *    columns [nb_comp+nca .. nb_comp+nca+ncb-1]           : moy_b
+ *    columns [nb_comp+nca+ncb .. nb_comp+nca+ncb+ncc-1]   : moy_c
+ *    column  [nb_comp+nca+ncb+ncc]                        : dt_integration
+ *
+ *  This extended DoubleTab is the computed field of integrale_tps_abc_, named with
+ *  its regular pdi_name. sauvegarder/reprendre thus delegate entirely to the
+ *  sauvegarder/reprendre of that field (per the original note, Champ_Fonc_base,
+ *  covering xyz, single_hdf and pdi) without format-specific code in this operator.
+ *
+ *  For post-processing, get_champ() extracts the first nb_comp columns.
+ *  For PDI, data_a_sauvegarder() declares a single field (the extended array),
+ *  exactly like Op_Correlation.
+ *
+ */
+class Op_Correlation_Triple : public Operateur_Statistique_tps_base
+{
+  Declare_instanciable(Op_Correlation_Triple);
+public:
+  inline const Nom& le_nom() const override { return integrale_tps_abc_.le_champ_calcule().le_nom(); }
+  inline double temps() const override { return integrale_tps_abc_.le_champ_calcule().temps(); }
+  inline const Integrale_tps_produit_champs& integrale() const override { return integrale_tps_abc_; }
+
+  inline const OBS_PTR(Champ_Generique_base)& le_champ_a() const { return le_champ_a_; }
+  // Number of components to post-process (auxiliary columns of the extended array excluded)
+  inline int nb_comp_post() const { return nb_comp_abc_; }
+
+  // Access to the extended array (values of the computed field of integrale_tps_abc_)
+  inline DoubleTab& valeurs_etendues() { return integrale_tps_abc_.le_champ_calcule().valeurs(); }
+  inline const DoubleTab& valeurs_etendues() const { return integrale_tps_abc_.le_champ_calcule().valeurs(); }
+
+  void mettre_a_jour(double tps) override;
+  inline void initialiser(double val) override;
+  inline void associer(const Domaine_dis_base&, const Champ_base&, double t1, double t2);
+  inline void associer(const Domaine_dis_base&, const Champ_Generique_base&, double t1, double t2) override;
+  void associer(const Domaine_dis_base&, const Champ_Generique_base&,
+                const Champ_Generique_base&, const Champ_Generique_base&, double t1, double t2);
+  inline void fixer_tstat_deb(double, double) override;
+  inline void fixer_tstat_fin(double) override;
+  int completer_post_statistiques(const Domaine& dom, const int is_axi, Format_Post_base& format) override;
+  inline std::vector<YAML_data> data_a_sauvegarder() const override;
+  inline int sauvegarder(Sortie& os) const override;
+  inline int reprendre(Entree& is) override;
+  void associer_op_stat(const Operateur_Statistique_tps_base&) override { }
+  void completer(const Probleme_base&, const Nom&) override;
+  void calculer(DoubleTab&) const override;
+  void fill_result(DoubleTab& tab) const; // writes directly into tab (preserves md_vector)
+
+protected:
+  OBS_PTR(Champ_Generique_base) le_champ_a_, le_champ_b_, le_champ_c_;
+
+  // Extended array: [triple_integral | moy_a | moy_b | moy_c | dt_integration]
+  // Size: (n_elem, nb_comp_abc + nca + ncb + ncc + 1)
+  // Stored in the computed field of integrale_tps_abc_.
+  Integrale_tps_produit_champs integrale_tps_abc_;
+
+  // Offsets into the extended array (set in completer())
+  int nb_comp_abc_ = 1; // nb_comp_a * nb_comp_b * nb_comp_c
+  int off_moy_a_   = 1; // = nb_comp_abc_
+  int off_moy_b_   = 2; // = nb_comp_abc_ + nca
+  int off_moy_c_   = 3; // = nb_comp_abc_ + nca + ncb
+  int off_dt_      = 4; // = nb_comp_abc_ + nca + ncb + ncc
+  int nb_comp_tot_ = 5; // = nb_comp_abc_ + nca + ncb + ncc + 1
+
+  int nb_comp_a_ = 1, nb_comp_b_ = 1, nb_comp_c_ = 1;
+
+  double t_deb_          = 0.;
+  double t_fin_          = 1.e30;
+  double tps_courant_    = 0.;
+};
+
+// ---------------------------------------------------------------------------
+// Extended-array helpers (declared only, not defined in this header) and inline member definitions
+// ---------------------------------------------------------------------------
+
+// dt_integration is encoded in column off_dt, row 0 (same value on every element)
+double get_dt(const DoubleTab& tab, int off_dt);
+
+void set_dt(DoubleTab& tab, int off_dt, double val);
+
+inline void Op_Correlation_Triple::initialiser(double val_init)
+{
+  valeurs_etendues() = val_init; // assigns val_init to the whole extended array
+}
+
+inline void Op_Correlation_Triple::associer(const Domaine_dis_base&, const Champ_base&, double, double)
+{
+  Cerr << "Exactly three fields must be associated to triple correlation operator." << finl;
+  exit(); // single-field association is rejected: the message above is printed, then exit() is called
+}
+
+inline void Op_Correlation_Triple::associer(const Domaine_dis_base&, const Champ_Generique_base&, double, double)
+{
+  Cerr << "Exactly three fields must be associated to triple correlation operator." << finl;
+  exit(); // single-field association is rejected: the message above is printed, then exit() is called
+}
+
+inline void Op_Correlation_Triple::fixer_tstat_deb(double tdeb, double tps)
+{
+  // Do NOT reset the values here: fixer_tstat_deb is called by the
+  // framework AFTER reprendre() to reposition the time bounds, and
+  // must not overwrite the restored data.
+  // Resetting is only done through initialiser(0), called by
+  // fixer_serie() (statistics-in-series case), not on a restart.
+  t_deb_ = tdeb;
+  tps_courant_ = tps;
+  integrale_tps_abc_.fixer_t_debut(tdeb);
+  integrale_tps_abc_.fixer_tps_integrale(tps);
+  integrale_tps_abc_.fixer_dt_integr(tps - tdeb);
+}
+
+inline void Op_Correlation_Triple::fixer_tstat_fin(double tps)
+{
+  integrale_tps_abc_.fixer_t_fin(tps); // forward the end time to the time integral
+  t_fin_ = tps;                        // and keep it in this operator as well
+}
+
+inline std::vector<YAML_data> Op_Correlation_Triple::data_a_sauvegarder() const
+{
+  // A single entry is declared: the values of the computed field (extended array).
+  const auto& computed_field = integrale_tps_abc_.le_champ_calcule();
+  const Nom& pdi_name = computed_field.get_pdi_name();
+  const int rank = computed_field.valeurs().nb_dim();
+  std::vector<YAML_data> entries;
+  entries.push_back(YAML_data(pdi_name.getString(), "double", rank));
+  return entries;
+}
+
+inline int Op_Correlation_Triple::sauvegarder(Sortie& os) const
+{
+  const auto& computed_field = integrale_tps_abc_.le_champ_calcule(); // holds the extended array
+  return computed_field.sauvegarder(os); // saving is fully delegated to that field
+}
+
+inline int Op_Correlation_Triple::reprendre(Entree& is)
+{
+  auto& computed_field = integrale_tps_abc_.le_champ_calcule();
+  computed_field.reprendre(is);
+  // Re-synchronise the cached time and integration time from the restored field.
+  const double restored_dt = get_dt(valeurs_etendues(), off_dt_);
+  tps_courant_ = computed_field.temps();
+  integrale_tps_abc_.fixer_tps_integrale(tps_courant_);
+  integrale_tps_abc_.fixer_dt_integr(restored_dt);
+  return 1;
+}
+
+#endif
diff --git a/src/Kernel/Statistiques_temps/Op_Ecart_type.cpp b/src/Kernel/Statistiques_temps/Op_Ecart_type.cpp
index 365d058ec9..078b168ac3 100644
--- a/src/Kernel/Statistiques_temps/Op_Ecart_type.cpp
+++ b/src/Kernel/Statistiques_temps/Op_Ecart_type.cpp
@@ -68,7 +68,7 @@ void Op_Ecart_type::completer(const Probleme_base& Pb, const Nom& prefix)
   integrale_carre_champ_.le_champ_calcule().changer_temps(Pb.schema_temps().temps_courant());
 }
 
-DoubleTab Op_Ecart_type::calculer_valeurs() const
+void Op_Ecart_type::calculer(DoubleTab& ecart_type) const
 {
   double dt = dt_integration();
   if (!est_egal(dt, dt_integration_carre()))
@@ -76,7 +76,6 @@ DoubleTab Op_Ecart_type::calculer_valeurs() const
       Cerr << "Not implemented yet in Op_Ecart_type::calculer_valeurs()" << finl;
       exit();
     }
-  DoubleTrav ecart_type(valeurs_carre());
   ecart_type = valeurs_carre();
   if (dt > 0)
     {
@@ -86,5 +85,4 @@ DoubleTab Op_Ecart_type::calculer_valeurs() const
       ecart_type.abs();                      // To avoid negative number ?
       ecart_type.racine_carree();            // sqrt(mean(I^2)-mean(I)^2)
     }
-  return ecart_type;
 }
diff --git a/src/Kernel/Statistiques_temps/Op_Ecart_type.h b/src/Kernel/Statistiques_temps/Op_Ecart_type.h
index c14c38be39..948ae030f4 100644
--- a/src/Kernel/Statistiques_temps/Op_Ecart_type.h
+++ b/src/Kernel/Statistiques_temps/Op_Ecart_type.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -44,7 +44,7 @@ class Op_Ecart_type : public Operateur_Statistique_tps_base
   inline void fixer_tstat_fin(double) override;
   inline void associer_op_stat(const Operateur_Statistique_tps_base&) override;
   void completer(const Probleme_base&, const Nom&) override;
-  DoubleTab calculer_valeurs() const override;
+  void calculer(DoubleTab&) const override;
   inline std::vector<YAML_data> data_a_sauvegarder() const override;
   inline int sauvegarder(Sortie& os) const override;
   inline int reprendre(Entree& is) override;
diff --git a/src/Kernel/Statistiques_temps/Op_Moyenne.cpp b/src/Kernel/Statistiques_temps/Op_Moyenne.cpp
index 4fd29ac6d8..cc3e15049e 100644
--- a/src/Kernel/Statistiques_temps/Op_Moyenne.cpp
+++ b/src/Kernel/Statistiques_temps/Op_Moyenne.cpp
@@ -71,12 +71,10 @@ void Op_Moyenne::completer(const Probleme_base& Pb, const Nom& prefix)
   integrale_champ_.le_champ_calcule().changer_temps(Pb.schema_temps().temps_courant());
 }
 
-DoubleTab Op_Moyenne::calculer_valeurs() const
+void Op_Moyenne::calculer(DoubleTab& moyenne) const
 {
   double dt = dt_integration();
-  DoubleTrav moyenne(valeurs());
   moyenne = valeurs();
   if (dt > 0)
     moyenne /= dt;
-  return moyenne;
 }
diff --git a/src/Kernel/Statistiques_temps/Op_Moyenne.h b/src/Kernel/Statistiques_temps/Op_Moyenne.h
index 0cd743592e..a91c3cb4be 100644
--- a/src/Kernel/Statistiques_temps/Op_Moyenne.h
+++ b/src/Kernel/Statistiques_temps/Op_Moyenne.h
@@ -43,7 +43,7 @@ class Op_Moyenne: public Operateur_Statistique_tps_base
   inline int sauvegarder(Sortie& os) const override;
   inline int reprendre(Entree& is) override;
   void completer(const Probleme_base&, const Nom&) override;
-  DoubleTab calculer_valeurs() const override;
+  void calculer(DoubleTab&) const override;
 
 protected:
   Integrale_tps_Champ integrale_champ_;
diff --git a/src/Kernel/Statistiques_temps/Operateur_Statistique_tps_base.h b/src/Kernel/Statistiques_temps/Operateur_Statistique_tps_base.h
index 675fd0019a..2be7e80dcc 100644
--- a/src/Kernel/Statistiques_temps/Operateur_Statistique_tps_base.h
+++ b/src/Kernel/Statistiques_temps/Operateur_Statistique_tps_base.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -47,7 +47,7 @@ class Operateur_Statistique_tps_base : public Objet_U
   virtual const Integrale_tps_Champ& integrale() const =0;
   virtual void initialiser(double val) =0;
   virtual void completer(const Probleme_base&, const Nom& post_name) =0;
-  virtual DoubleTab calculer_valeurs() const =0;
+  virtual void calculer(DoubleTab&) const =0;
   virtual int completer_post_statistiques(const Domaine& dom,const int is_axi,Format_Post_base& format);
   inline double tstat_deb() const { return tstat_deb_; }
   inline double tstat_fin() const { return tstat_fin_; }
diff --git a/src/Kernel/Utilitaires/Device.cpp b/src/Kernel/Utilitaires/Device.cpp
index 6a36e81cde..04e3642a7a 100644
--- a/src/Kernel/Utilitaires/Device.cpp
+++ b/src/Kernel/Utilitaires/Device.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -33,19 +33,6 @@
 #include <PE_Groups.h>
 #endif
 
-/*
-bool init_device_ = false;
-bool clock_on = false;
-bool fence = true;
-double clock_start;
-int timer_counter=0;
-#ifdef TRUST_USE_GPU
-bool timer = true;
-#else
-bool timer = false;
-#endif
-*/
-
 std::string ptrToString(const void* adr)
 {
   std::stringstream ss;
@@ -197,23 +184,30 @@ _TYPE_* allocateOnDevice(_TYPE_* ptr, _SIZE_ size)
 #ifdef TRUST_USE_GPU
   assert(!isAllocatedOnDevice(ptr)); // Verifie que la zone n'est pas deja allouee
   statistics().begin_count(STD_COUNTERS::gpu_malloc_free,statistics().get_last_opened_counter_level()+1);
-  size_t bytes = sizeof(_TYPE_) * size;
-  size_t free_bytes  = DeviceMemory::deviceMemGetInfo(0);
-  size_t total_bytes = DeviceMemory::deviceMemGetInfo(1);
-  if (bytes>free_bytes)
+  _TYPE_ *device_ptr = nullptr;
+  if (size>0)
     {
-      Cerr << "Error ! Trying to allocate " << bytes << " bytes on GPU memory whereas only " << free_bytes << " bytes are available." << finl;
-      Process::exit();
+      size_t bytes = sizeof(_TYPE_) * size;
+      size_t free_bytes = DeviceMemory::deviceMemGetInfo(0);
+      size_t total_bytes = DeviceMemory::deviceMemGetInfo(1);
+      if (bytes > free_bytes)
+        {
+          Cerr << "Error ! Trying to allocate " << bytes << " bytes on GPU memory whereas only " << free_bytes
+               << " bytes are available." << finl;
+          Process::exit();
+        }
+      device_ptr = static_cast<_TYPE_ *>(Kokkos::kokkos_malloc(bytes));
+      if (statistics().is_gpu_verbose_on() && Process::je_suis_maitre())
+        {
+          std::string clock(Process::is_parallel() ? "[clock]#" + std::to_string(Process::me()) : "[clock]  ");
+          double ms = 1000 * statistics().get_time_since_last_open(STD_COUNTERS::gpu_malloc_free);
+          printf("%s %7.3f ms [Data]   Allocate on device [%9s] %6ld Bytes (%ld/%ldGB free) Currently allocated: %6ld\n",
+                 clock.c_str(), ms, ptrToString(ptr).c_str(), long(bytes), free_bytes / (1024 * 1024 * 1024),
+                 total_bytes / (1024 * 1024 * 1024), long(DeviceMemory::allocatedBytesOnDevice()));
+        }
     }
-  _TYPE_* device_ptr = static_cast<_TYPE_*>(Kokkos::kokkos_malloc(bytes));
   // Map host_ptr with device_ptr:
   DeviceMemory::add(ptr, device_ptr, size * sizeof(_TYPE_));
-  if (statistics().is_gpu_verbose_on() && Process::je_suis_maitre())
-    {
-      std::string clock(Process::is_parallel() ? "[clock]#"+std::to_string(Process::me()) : "[clock]  ");
-      double ms = 1000 * statistics().get_time_since_last_open(STD_COUNTERS::gpu_malloc_free);
-      printf("%s %7.3f ms [Data]   Allocate on device [%9s] %6ld Bytes (%ld/%ldGB free) Currently allocated: %6ld\n", clock.c_str(), ms, ptrToString(ptr).c_str(), long(bytes), free_bytes/(1024*1024*1024), total_bytes/(1024*1024*1024), long(DeviceMemory::allocatedBytesOnDevice()));
-    }
   statistics().end_count(STD_COUNTERS::gpu_malloc_free);
 #ifndef NDEBUG
   const _TYPE_ INVALIDE_ = (std::is_same<_TYPE_,double>::value) ? DMAXFLOAT*0.999 : ( (std::is_same<_TYPE_,int>::value) ? INT_MIN : 0); // Identique a TRUSTArray<_TYPE_>::fill_default_value()
@@ -509,15 +503,16 @@ std::string start_gpu_timer(std::string str, int bytes)
 #ifdef TRUST_USE_GPU
   if (!statistics().get_init_device())
     return str;
+  //std::cerr << "Provisoire start_gpu_timer " << str << std::endl;
   if (statistics().get_gpu_timer())
-    Process::exit("A GPU KERNEL is still running, you can't open a new one yet");
+    {
+      Cerr << "A GPU KERNEL is still running, you can't open a new one (" << str << ") yet." << finl;
+      Cerr <<"Probably you forgot to define a end_gpu_timer(...) call." << finl;
+      Cerr <<"Or more subtil bug: you copy a C++ object on the device which has at least one TRUST array as attribute." << finl;
+      Process::exit();
+    }
   statistics().start_gpu_timer();
   statistics().add_to_gpu_timer_counter(1);
-#ifndef NDEBUG
-  if (statistics().get_gpu_timer_counter()>1)
-    Cerr << "[Kokkos] timer_counter=" << statistics().get_gpu_timer_counter() << " : start_gpu_timer() not closed by end_gpu_timer() !" << finl;
-  //Process::exit("Error, start_gpu_timer() not closed by end_gpu_timer() !");
-#endif
   if (bytes == -1)
     statistics().begin_count(STD_COUNTERS::gpu_kernel,statistics().get_last_opened_counter_level()+1);
 #ifdef TRUST_USE_CUDA
@@ -535,12 +530,8 @@ void end_gpu_timer(const std::string& str, int onDevice, int bytes) // Return in
 #ifdef TRUST_USE_GPU
   if (!statistics().get_init_device())
     return;
+  //std::cerr << "Provisoire end_gpu_timer " << str << std::endl;
   statistics().add_to_gpu_timer_counter(-1);
-#ifndef NDEBUG
-  if (statistics().get_gpu_timer_counter()!=0)
-    Cerr << "[Kokkos] timer_counter=" << statistics().get_gpu_timer_counter() << " : end_gpu_timer() not opened by start_gpu_timer() !" << finl;
-  //Process::exit("Error, start_gpu_timer() not closed by end_gpu_timer() !");
-#endif
   if (onDevice)
     {
 #ifdef TRUST_USE_UVM
diff --git a/src/Kernel/Utilitaires/Schema_Comm_Vecteurs.cpp b/src/Kernel/Utilitaires/Schema_Comm_Vecteurs.cpp
index 1eb5d28f15..cd7b41146a 100644
--- a/src/Kernel/Utilitaires/Schema_Comm_Vecteurs.cpp
+++ b/src/Kernel/Utilitaires/Schema_Comm_Vecteurs.cpp
@@ -94,7 +94,7 @@ Schema_Comm_Vecteurs::Schema_Comm_Vecteurs()
       if (getenv("MPICH_GPU_SUPPORT_ENABLED") == nullptr)
         Process::exit("You try to enable GPU communications on Cray MPICH with TRUST_USE_MPI_GPU_AWARE=1 but forgot to set also MPICH_GPU_SUPPORT_ENABLED=1 !");
 #endif
-      std::cerr << "[MPI] Enabling GPU capability to communicate between devices." << std::endl;
+      //std::cerr << "[MPI] Enabling GPU capability to communicate between devices." << std::endl;
       //Cerr << "[MPI] Warning! Only MPI calls with device pointers will benefit. Classic MPI calls with host pointers will be slower..." << finl;
     }
 }
diff --git a/src/Kernel/Utilitaires/View_Types.h b/src/Kernel/Utilitaires/View_Types.h
index bb59387dd5..24f34d868b 100644
--- a/src/Kernel/Utilitaires/View_Types.h
+++ b/src/Kernel/Utilitaires/View_Types.h
@@ -43,6 +43,8 @@ using host_mirror_space = Kokkos::HostSpace;
 
 // The execution space (=where code is run): on the device if compiled for GPU, else CPU.
 using execution_space = DeviceView<double, 1>::execution_space;
+using HostSpace = Kokkos::DefaultHostExecutionSpace;
+using DeviceSpace = Kokkos::DefaultExecutionSpace;
 
 // Typedefs for range policies in kernels
 using range_1D = Kokkos::RangePolicy<execution_space>;
diff --git a/src/Kernel/Utilitaires/kokkos++.h b/src/Kernel/Utilitaires/kokkos++.h
index 5f88d1e426..12b2493f3b 100644
--- a/src/Kernel/Utilitaires/kokkos++.h
+++ b/src/Kernel/Utilitaires/kokkos++.h
@@ -44,8 +44,13 @@
 #pragma diag_warning 47
 #endif
 
+// CUDA device lambdas (KOKKOS_LAMBDA) can only capture members of public classes.
+// These macros preserve the intended C++ access level in CPU builds while forcing
+// public visibility in CUDA builds so that Kokkos kernels defined in these sections compile.
 #ifdef TRUST_USE_CUDA
-#define public_for_cuda public:
+#define protected_but_public_for_cuda public:
+#define private_but_public_for_cuda public:
 #else
-#define public_for_cuda protected:
+#define protected_but_public_for_cuda protected:
+#define private_but_public_for_cuda private:
 #endif
diff --git a/src/Kernel/Utilitaires/kokkos_test.cpp b/src/Kernel/Utilitaires/kokkos_test.cpp
index aaca1a6922..18578b4979 100644
--- a/src/Kernel/Utilitaires/kokkos_test.cpp
+++ b/src/Kernel/Utilitaires/kokkos_test.cpp
@@ -153,7 +153,6 @@ void kokkos_self_test()
   // C++ object in Kokkos region
   {
     ArrOfDouble f(nb_elem);
-    f = 0;
     std::string expr("2*x+2");
     // Parser sur le device;
     ParserView parser(expr, 1);
diff --git a/src/Kernel/VF/Champs/Champ_Fonc_reprise.cpp b/src/Kernel/VF/Champs/Champ_Fonc_reprise.cpp
index bc81db1ad6..5cbd965057 100644
--- a/src/Kernel/VF/Champs/Champ_Fonc_reprise.cpp
+++ b/src/Kernel/VF/Champs/Champ_Fonc_reprise.cpp
@@ -466,7 +466,7 @@ void Champ_Fonc_reprise::read_field_from_file(Entree& jdd, Entree& file, const P
       champ_moyen.reprendre(file);
 
       // On remplit le champ
-      le_champ().valeurs() = champ_moyen.calculer_valeurs();
+      champ_moyen.calculer(le_champ().valeurs());
     }
   else if (reprend_modele_k_eps)
     {
diff --git a/src/Kernel/VF/Champs/Champ_front_recyclage.cpp b/src/Kernel/VF/Champs/Champ_front_recyclage.cpp
index d1960cca75..c85b1562d7 100644
--- a/src/Kernel/VF/Champs/Champ_front_recyclage.cpp
+++ b/src/Kernel/VF/Champs/Champ_front_recyclage.cpp
@@ -454,6 +454,7 @@ void Champ_front_recyclage::get_coord_faces(const Frontiere_dis_base& fr_vf,
   const int      dim       = xv2.dimension(1);
 
   coords.resize(nb_faces2,dim);
+  ToDo_Kokkos("critical");
   for (int i = 0; i < nb_faces2; i++)
     for (int j = 0; j < dim; j++)
       coords(i,j) = xv2(i+ndeb2,j) + delt_dist(j);
@@ -625,6 +626,7 @@ int Champ_front_recyclage::initialiser(double temps, const Champ_Inc_base& inco)
 
       int nb_remote_faces = 0;
       // Loop on local faces on the process pe:
+      ToDo_Kokkos("critical");
       for (int face = 0; face < nb_faces_on_pe; face++)
         {
           const int elem = elem_list[face];
@@ -659,6 +661,7 @@ int Champ_front_recyclage::initialiser(double temps, const Champ_Inc_base& inco)
       //Cerr << index_to_recv << finl;
     }
   bool error_1 = false, error_2 = false;
+  ToDo_Kokkos("critical");
   for (int i = 0; i < nb_faces2; i++)
     {
       if (count[i] < 1)
@@ -756,6 +759,7 @@ void Champ_front_recyclage::mettre_a_jour(double temps)
   calcul_moyenne_imposee(tab,temps);
   calcul_moyenne_recyclee(tab,temps);
 
+  ToDo_Kokkos("critical");
   for (int i=0; i<nb_faces2; i++)
     for (int dir=0; dir<nb_compo_; dir++)
       tab(i,dir) = ampli_moy_imposee_(dir)*moyenne_imposee_(i,dir) + ampli_fluct_(dir)*(tab(i,dir)-ampli_moy_recyclee_(dir)*moyenne_recyclee_(i,dir));
@@ -884,6 +888,7 @@ void Champ_front_recyclage::calcul_moyenne_imposee(const DoubleTab& tab,double t
       int nb_faces_bord2 = fr_vf2.nb_faces();
       int ndeb = fr_vf2.num_premiere_face();
 
+      ToDo_Kokkos("critical");
       for (int i=0; i<nb_faces_bord2; i++)
         for (int j=0; j<nb_compo_; j++)
           {
@@ -935,6 +940,7 @@ void Champ_front_recyclage::calcul_moyenne_recyclee(const DoubleTab& tab,double
       DoubleVect moyenne(nb_compo_);
       double somme_si = 0.;
       moyenne = 0.;
+      ToDo_Kokkos("critical");
       for (int i=0; i<nb_faces_bord2; i++)
         {
           for (int j=0; j<nb_compo_; j++)
@@ -948,6 +954,7 @@ void Champ_front_recyclage::calcul_moyenne_recyclee(const DoubleTab& tab,double
         moyenne(j) = mp_sum(moyenne(j));
       moyenne /= somme_si;
 
+      ToDo_Kokkos("critical");
       for (int i=0; i<nb_faces_bord2; i++)
         for (int j=0; j<nb_compo_; j++)
           moyenne_recyclee_(i,j) = moyenne(j);
@@ -1080,6 +1087,7 @@ void Champ_front_recyclage::lire_fichier_format1(DoubleTab& moyenne,
       exit();
     }
 
+  ToDo_Kokkos("critical");
   for (int i=0; i<nb_faces2; i++)
     {
       double y = coords(i,dir_ortho);
diff --git a/src/Kernel/VF/Champs/Champ_front_softanalytique.cpp b/src/Kernel/VF/Champs/Champ_front_softanalytique.cpp
index f9e7d83b34..1904c040d6 100644
--- a/src/Kernel/VF/Champs/Champ_front_softanalytique.cpp
+++ b/src/Kernel/VF/Champs/Champ_front_softanalytique.cpp
@@ -104,6 +104,7 @@ int Champ_front_softanalytique::initialiser(double temps, const Champ_Inc_base&
 
   const Front_VF& le_bord = ref_cast(Front_VF,frontiere_dis());
   int nb_faces_bord_tot=le_bord.nb_faces_tot();
+  ToDo_Kokkos("critical");
   for (int ind_face=0; ind_face<nb_faces_bord_tot; ind_face++)
     {
       int face = le_bord.num_face(ind_face);
@@ -116,9 +117,9 @@ int Champ_front_softanalytique::initialiser(double temps, const Champ_Inc_base&
 
       for( k=0; k<dim; k++)
         {
-          fxyz[k].setVar("x",x);
-          fxyz[k].setVar("y",y);
-          fxyz[k].setVar("z",z);
+          fxyz[k].setVar(0,x);
+          fxyz[k].setVar(1,y);
+          fxyz[k].setVar(2,z);
           tab(ind_face,k)=fxyz[k].eval();
           //Cout << " x y z " << x << " " << y << " " << z << " " << tab(i,k) << finl;
         }
@@ -141,9 +142,9 @@ void Champ_front_softanalytique::valeur_a(DoubleVect& position, DoubleVect& vale
 
   for (int k=0; k<dim; k++)
     {
-      fxyz[k].setVar("x",x);
-      fxyz[k].setVar("y",y);
-      fxyz[k].setVar("z",z);
+      fxyz[k].setVar(0,x);
+      fxyz[k].setVar(1,y);
+      fxyz[k].setVar(2,z);
       valeur[k] = fxyz[k].eval();
     }
   return;
diff --git a/src/Kernel/VF/Geometrie/Domaine_VF.cpp b/src/Kernel/VF/Geometrie/Domaine_VF.cpp
index 50962ebcbe..89bdde13ce 100644
--- a/src/Kernel/VF/Geometrie/Domaine_VF.cpp
+++ b/src/Kernel/VF/Geometrie/Domaine_VF.cpp
@@ -1205,18 +1205,22 @@ void Domaine_VF::build_mc_Cmesh_nodesCorrespondence()
 
 #endif
 
-void Domaine_VF::get_position(DoubleTab& positions) const
+void Domaine_VF::get_position(DoubleTab& tab_positions) const
 {
-  positions.resize(nb_elem(), xp_.dimension(1));
-  CDoubleTabView xp = xp_.view_ro();
-  DoubleTabView positions_v = positions.view_wo();
-  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0,0}, {nb_elem(), xp_.dimension(1)}), KOKKOS_LAMBDA(const int i, const int j)
-  {
-    positions_v(i,j) = xp(i,j);
-  });
-  end_gpu_timer(__KERNEL_NAME__);
-  // Don't work with simply: ToDo fix
-  // positions = zvf.xp();
+  if (xp().nb_dim() != 2) /* xp() non initialise */
+    domaine().calculer_centres_gravite(tab_positions);
+  else
+    {
+      // tab_positions.ref(xp()); fails on the QC_vs_WC_jdd6 test case in parallel (why?)
+      tab_positions.resize(nb_elem(), xp().dimension(1), RESIZE_OPTIONS::NOCOPY_NOINIT); // Resize without init
+      CDoubleTabView xp = xp_.view_ro();
+      DoubleTabView positions = tab_positions.view_wo();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0,0}, {nb_elem(), xp_.dimension(1)}), KOKKOS_LAMBDA(const int i, const int j)
+      {
+        positions(i,j) = xp(i,j);
+      });
+      end_gpu_timer(__KERNEL_NAME__);
+    }
 }
 
 double Domaine_VF::compute_L1_norm(const DoubleVect& val_source, const bool basis_function, const int order) const
diff --git a/src/MAIN/catch_and_trace.cpp b/src/MAIN/catch_and_trace.cpp
index 4c3f94109a..742c0490d5 100644
--- a/src/MAIN/catch_and_trace.cpp
+++ b/src/MAIN/catch_and_trace.cpp
@@ -134,6 +134,5 @@ void install_handlers()
                 << " (" << strsignal(SIGFPE) << ")" << std::endl;
       Process::exit();
     }
-  std::cerr << "Custom error handlers correctly installed. SIGFPE and SIGABRT redirected." << std::endl;
 }
 
diff --git a/src/MAIN/mon_main.cpp b/src/MAIN/mon_main.cpp
index b5749738a1..b16f08869c 100644
--- a/src/MAIN/mon_main.cpp
+++ b/src/MAIN/mon_main.cpp
@@ -101,12 +101,6 @@ static int init_petsc(True_int argc, char **argv, bool with_mpi,bool& trio_began
     }
 #else
   PetscInitialize(&argc, &argv, (char*)0, help);
-#endif
-#ifdef TRUST_USE_GPU
-  PetscDevice device;
-  PetscDeviceCreate(PETSC_DEVICE_DEFAULT(), PETSC_DECIDE, &device);
-  PetscDeviceView(device, PETSC_VIEWER_STDERR_WORLD);
-  //if (instance==1) PetscLogGpuTime(); // Slow down calculation ! Use -log_view_gpu_time
 #endif
   // Bizarrerie qui se produit sur une machine (ioulia, MPICH natif): PetscInitialize change le pwd()
   // en sequentiel et si le binaire n'est pas dans le repertoire de l'etude, le pwd est perdu...
@@ -125,11 +119,6 @@ static int init_petsc(True_int argc, char **argv, bool with_mpi,bool& trio_began
   // Desactive le signal handler en optimise pour eviter d'etre trop bavard
   // et de "masquer" les messages d'erreur TRUST:
   PetscPopSignalHandler();
-
-#ifndef __CYGWIN__
-  Cerr << "Enabling error handlers catching SIGFPE and SIGABORT and giving a trace of where the fault happened." << finl;
-  install_handlers();
-#endif
 #else
 #ifdef MPI_
   // MPI_Init pour les machines ou Petsc n'est pas installe
@@ -268,13 +257,24 @@ void mon_main::init_parallel(const int argc, char **argv, bool with_mpi, bool ch
       if (Process::je_suis_maitre())
         Cerr << "Kokkos initialized after MPI !" << finl;
     }
+  // Print GPU information with PETSc and Kokkos:
   if (Process::je_suis_maitre())
     {
 #ifdef TRUST_USE_GPU
+#ifdef PETSCKSP_H
+      PetscDevice device;
+      PetscDeviceCreate(PETSC_DEVICE_DEFAULT(), PETSC_DECIDE, &device);
+      PetscDeviceView(device, PETSC_VIEWER_STDERR_SELF);
+      //if (instance==1) PetscLogGpuTime(); // Slows down the calculation! Use -log_view_gpu_time instead
+#endif
       Kokkos::print_configuration(std::cerr, true);
 #endif
       Cerr << "You can run --kokkos-help option." << finl;
     }
+#ifndef __CYGWIN__
+  if (Process::je_suis_maitre()) Cerr << "Enabling error handlers catching SIGFPE and SIGABORT and giving a trace of where the fault happened." << finl;
+  install_handlers();
+#endif
 }
 
 void mon_main::finalize()
diff --git a/src/PolyMAC_family/Champs/Champ_Elem_PolyMAC_MPFA.cpp b/src/PolyMAC_family/Champs/Champ_Elem_PolyMAC_MPFA.cpp
index 37a12cf1c7..91fe5f2345 100644
--- a/src/PolyMAC_family/Champs/Champ_Elem_PolyMAC_MPFA.cpp
+++ b/src/PolyMAC_family/Champs/Champ_Elem_PolyMAC_MPFA.cpp
@@ -33,7 +33,7 @@ const Domaine_PolyMAC_MPFA& Champ_Elem_PolyMAC_MPFA::domaine_PolyMAC_MPFA() cons
 inline void Champ_Elem_PolyMAC_MPFA::mettre_a_jour(double tps)
 {
   if (tps_last_calc_grad_ != tps) grad_a_jour = 0;
-  Champ_Inc_P0_base::mettre_a_jour(tps);
+  Champ_Inc_base::mettre_a_jour(tps);
 }
 
 int Champ_Elem_PolyMAC_MPFA::reprendre(Entree& fich)
diff --git a/src/PolyMAC_family/Champs/Champ_Fonc_Tabule_Elem_PolyMAC_CDO.cpp b/src/PolyMAC_family/Champs/Champ_Fonc_Tabule_Elem_PolyMAC_CDO.cpp
index 6ad89ca9cf..6b51cee206 100644
--- a/src/PolyMAC_family/Champs/Champ_Fonc_Tabule_Elem_PolyMAC_CDO.cpp
+++ b/src/PolyMAC_family/Champs/Champ_Fonc_Tabule_Elem_PolyMAC_CDO.cpp
@@ -31,6 +31,7 @@ void Champ_Fonc_Tabule_Elem_PolyMAC_CDO::associer_param(const VECT(OBS_PTR(Champ
 
 void Champ_Fonc_Tabule_Elem_PolyMAC_CDO::mettre_a_jour(double t)
 {
+  // ToDo: replace by Champ_Fonc_P0_base::mettre_a_jour(t, la_table.valeur(), les_ch_param);
   const Domaine_VF& zvf = le_dom_VF.valeur();
   const Table& table = la_table.valeur();
   DoubleTab& mes_valeurs = valeurs();
diff --git a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Couplage_Parietal_PolyMAC_MPFA_helper.cpp b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Couplage_Parietal_PolyMAC_MPFA_helper.cpp
index d48b916919..39afb188c2 100644
--- a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Couplage_Parietal_PolyMAC_MPFA_helper.cpp
+++ b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Couplage_Parietal_PolyMAC_MPFA_helper.cpp
@@ -340,7 +340,7 @@ void Couplage_Parietal_PolyMAC_MPFA_helper::dimensionner_blocs(matrices_t matric
           *mat[i] = mat2;
       }
 
-  int n_sten = 0;
+  decltype(stencil.front().dimension(0)) n_sten = 0;
   for (const auto &st : stencil)
     n_sten += st.dimension(0); //n_sten : nombre total de points du stencil de l'operateur
 
diff --git a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_HFV_Elem.cpp b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_HFV_Elem.cpp
index 8a7437f092..aed42aaeaf 100644
--- a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_HFV_Elem.cpp
+++ b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_HFV_Elem.cpp
@@ -117,8 +117,7 @@ void Op_Diff_PolyMAC_HFV_Elem::dimensionner_blocs_ext(int aux_only, matrices_t m
 {
   init_op_ext();
   const std::string& nom_inco = (le_champ_inco ? le_champ_inco.valeur() : equation().inconnue()).le_nom().getString();
-  int i, j, k, l, e, o_e, f, o_f, fb, m, n, M, n_ext = (int) op_ext.size(), n_sten = 0, semi = (int) semi_impl.count(nom_inco);
-  long p;
+  int i, j, k, l, e, o_e, f, o_f, fb, m, n, M, n_ext = (int) op_ext.size(), semi = (int) semi_impl.count(nom_inco);
   std::vector<Matrice_Morse*> mat(n_ext); //matrices
   std::vector<int> N, ne_tot; //composantes, nombre d'elements total par pb
   std::vector<std::reference_wrapper<const Domaine_PolyMAC_HFV>> domaine; //domaines
@@ -127,6 +126,7 @@ void Op_Diff_PolyMAC_HFV_Elem::dimensionner_blocs_ext(int aux_only, matrices_t m
   std::vector<std::reference_wrapper<const DoubleTab>> diffu, inco; //inconnues, normales aux faces, positions elems / faces / sommets
   std::deque<ConstDoubleTab_parts> v_part; //blocs de chaque inconnue
   std::vector<Stencil> stencil(n_ext); //stencils par matrice
+  decltype(stencil[0].dimension(0)) n_sten = 0;
   for (i = 0, M = 0; i < n_ext; M = std::max(M, N[i]), i++)
     {
       std::string nom_mat = i ? nom_inco + "/" + op_ext[i]->equation().probleme().le_nom().getString() : nom_inco;
@@ -180,6 +180,7 @@ void Op_Diff_PolyMAC_HFV_Elem::dimensionner_blocs_ext(int aux_only, matrices_t m
     }
   /* problemes distants : pour les Echange_contact */
   const Echange_contact_PolyMAC_HFV *pcl;
+  long p;
   if (!semi)
     for (i = 0; i < cls[0].get().size(); i++)
       if ((pcl = sub_type(Echange_contact_PolyMAC_HFV, cls[0].get()[i].valeur()) ? &ref_cast(Echange_contact_PolyMAC_HFV, cls[0].get()[i].valeur()) : nullptr))
diff --git a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Elem.cpp b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Elem.cpp
index 44949d5f37..d2245797ef 100644
--- a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Elem.cpp
+++ b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Elem.cpp
@@ -246,7 +246,7 @@ void Op_Diff_PolyMAC_MPFA_Elem::dimensionner_blocs(matrices_t matrices, const ta
         *mat = mat2;
     }
 
-  int n_sten = stencil.dimension(0);
+  auto n_sten = stencil.dimension(0);
 
   const double elem_t = static_cast<double>(domaine.domaine().md_vector_elements()->nb_items_seq_tot()),
                face_t = static_cast<double>(domaine.md_vector_faces()->nb_items_seq_tot());
diff --git a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Face.cpp b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Face.cpp
index 9ea42b83f2..92f4c0f167 100644
--- a/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Face.cpp
+++ b/src/PolyMAC_family/Operateurs/Op_Diff_Dift/Op_Diff_PolyMAC_MPFA_Face.cpp
@@ -213,10 +213,10 @@ void Op_Diff_PolyMAC_MPFA_Face::dimensionner_blocs(matrices_t matrices, const ta
       }
 
   tableau_trier_retirer_doublons(stencil);
+#ifndef TRUST_USE_GPU
   const double face_t = static_cast<double>(domaine.md_vector_faces()->nb_items_seq_tot()),
                elem_t = static_cast<double>(domaine.domaine().md_vector_elements()->nb_items_seq_tot());
   const double width = mp_sum_as_double(stencil.dimension(0)) / (N * (face_t + D * elem_t));
-#ifndef TRUST_USE_GPU
   const double perc = mp_somme_vect_as_double(tpfa) * 100. / (N * face_t);
   Cerr << "width " << width << " " << perc  << "% TPFA " << finl;
 #endif
diff --git a/src/ThHyd/Dilatable/Common/Cond_Lim/Entree_fluide_temperature_imposee_H.h b/src/ThHyd/Dilatable/Common/Cond_Lim/Entree_fluide_temperature_imposee_H.h
index 5aa3901c21..9a1d8c5ce4 100644
--- a/src/ThHyd/Dilatable/Common/Cond_Lim/Entree_fluide_temperature_imposee_H.h
+++ b/src/ThHyd/Dilatable/Common/Cond_Lim/Entree_fluide_temperature_imposee_H.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -29,10 +29,12 @@ class Entree_fluide_temperature_imposee_H  : public Entree_fluide_temperature_im
 {
   Declare_instanciable(Entree_fluide_temperature_imposee_H);
 public :
+  using Dirichlet::val_imp;
   void completer() override;
   double val_imp(int i) const override;
   double val_imp(int i, int j) const override;
-
+  double val_imp_au_temps(double temps, int i) const override { return val_imp(i); }
+  double val_imp_au_temps(double temps, int i, int j) const override { return val_imp(i,j); }
 protected :
   OBS_PTR(Fluide_Dilatable_base) le_fluide;
 };
diff --git a/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.cpp b/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.cpp
index 50e7874c82..e9503ce61e 100644
--- a/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.cpp
+++ b/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -43,38 +43,6 @@ void Neumann_sortie_libre_Temp_H::completer()
   modifier_val_imp = 1;
 }
 
-/*! @brief Renvoie la valeur de la i-eme composante du champ impose a l'exterieur de la frontiere.
- *
- * @param (int i) indice suivant la premiere dimension du champ
- * @return (double) la valeur imposee sur la composante du champ specifiee
- * @throws deuxieme dimension du champ de frontiere superieur a 1
- */
-double Neumann_sortie_libre_Temp_H::val_ext(int i) const
-{
-  if (le_champ_ext->valeurs().size() == 1)
-    {
-      if (modifier_val_imp == 1)
-        return le_fluide->calculer_H(le_champ_ext->valeurs()(0, 0));
-      else
-        return le_champ_ext->valeurs()(0, 0);
-    }
-  else if (le_champ_ext->valeurs().dimension(1) == 1)
-    {
-      if (modifier_val_imp == 1)
-        return le_fluide->calculer_H(le_champ_ext->valeurs()(i, 0));
-      else
-        return le_champ_ext->valeurs()(i, 0);
-    }
-  else
-    {
-      Cerr << "Neumann_sortie_libre_Temp_H::val_ext" << finl;
-      Cerr << le_champ_ext << finl;
-    }
-
-  abort();
-  return 0.;
-}
-
 /*! @brief Renvoie la valeur de la (i,j)-eme composante du champ impose a l'exterieur de la frontiere.
  *
  * @param (int i) indice suivant la premiere dimension du champ
diff --git a/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.h b/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.h
index c264c32393..eafc636de5 100644
--- a/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.h
+++ b/src/ThHyd/Dilatable/Common/Cond_Lim/Neumann_sortie_libre_Temp_H.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -33,7 +33,7 @@ class Neumann_sortie_libre_Temp_H : public Neumann_sortie_libre
   Declare_instanciable(Neumann_sortie_libre_Temp_H);
 public:
   void completer() override;
-  double val_ext(int i) const override;
+  double val_ext(int i) const override { return val_ext(i,0); };
   double val_ext(int i,int j) const override;
 
 protected :
diff --git a/src/ThHyd/Dilatable/Common/Cond_Lim/Temperature_imposee_paroi_H.h b/src/ThHyd/Dilatable/Common/Cond_Lim/Temperature_imposee_paroi_H.h
index 61d7576e6a..2b02ef9ff0 100644
--- a/src/ThHyd/Dilatable/Common/Cond_Lim/Temperature_imposee_paroi_H.h
+++ b/src/ThHyd/Dilatable/Common/Cond_Lim/Temperature_imposee_paroi_H.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -30,6 +30,7 @@ class Temperature_imposee_paroi_H  : public Temperature_imposee_paroi
 {
   Declare_instanciable(Temperature_imposee_paroi_H);
 public :
+  using Dirichlet::val_imp;
   void completer() override;
   double val_imp(int i) const override;
   double val_imp(int i, int j) const override;
diff --git a/src/ThHyd/Dilatable/Common/Equations/Convection_Diffusion_Fluide_Dilatable_Proto.h b/src/ThHyd/Dilatable/Common/Equations/Convection_Diffusion_Fluide_Dilatable_Proto.h
index 68727d7a71..627228f5f4 100644
--- a/src/ThHyd/Dilatable/Common/Equations/Convection_Diffusion_Fluide_Dilatable_Proto.h
+++ b/src/ThHyd/Dilatable/Common/Equations/Convection_Diffusion_Fluide_Dilatable_Proto.h
@@ -44,7 +44,7 @@ class Convection_Diffusion_Fluide_Dilatable_Proto
   virtual ~Convection_Diffusion_Fluide_Dilatable_Proto() {}
 
 protected:
-  public_for_cuda
+  protected_but_public_for_cuda
   void assembler_impl(Convection_Diffusion_Fluide_Dilatable_base& eqn,
                       Matrice_Morse& mat_morse, const DoubleTab& present, DoubleTab& secmem);
 protected:
diff --git a/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.cpp b/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.cpp
index 6ca527a72b..c27951b073 100644
--- a/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.cpp
+++ b/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.cpp
@@ -456,7 +456,6 @@ void Navier_Stokes_Fluide_Dilatable_Proto::update_vpoint_on_boundaries(const Nav
   const DoubleTab& tab_rho_face_n = fluide_dil.rho_face_n(), &tab_rho_face_np1=fluide_dil.rho_face_np1();
   const DoubleTab& tab_vit = eqn.vitesse().valeurs();
   const Conds_lim& lescl = eqn.domaine_Cl_dis().les_conditions_limites();
-  const IntTab& face_voisins = eqn.domaine_dis().face_voisins();
   const int taille = tab_vpoint.line_size();
 
   if (taille==1)
@@ -470,30 +469,31 @@ void Navier_Stokes_Fluide_Dilatable_Proto::update_vpoint_on_boundaries(const Nav
           const Front_VF& la_front_dis = ref_cast(Front_VF,la_cl_base.frontiere_dis());
           const Dirichlet& diri=ref_cast(Dirichlet,la_cl_base);
           const int ndeb = la_front_dis.num_premiere_face(), nfin = ndeb + la_front_dis.nb_faces();
-
+          CDoubleTabView val_imp = diri.tab_val_imp().view_ro();
+          CDoubleArrView rho_face_np1 = static_cast<const ArrOfDouble&>(tab_rho_face_np1).view_ro();
+          CDoubleArrView rho_face_n = static_cast<const ArrOfDouble&>(tab_rho_face_n).view_ro();
           if (taille==1) // VDF //
             {
-              ToDo_Kokkos("critical");
-              for (int num_face=ndeb; num_face<nfin; num_face++)
-                {
-                  int n0 = face_voisins(num_face, 0);
-                  if (n0 == -1) n0 = face_voisins(num_face, 1);
-
-                  // GF en cas de diffsion implicite vpoint!=0 on ignrore l'ancienne valeur
-                  tab_vpoint(num_face)=(diri.val_imp(num_face-ndeb,orientation_VDF_(num_face))*tab_rho_face_np1(num_face)-
-                                        tab_vit(num_face)*tab_rho_face_n(num_face))/dt_;
-                }
+              CIntTabView face_voisins = eqn.domaine_dis().face_voisins().view_ro();
+              CIntArrView orientation_VDF = orientation_VDF_.view_ro();
+              CDoubleArrView vit = static_cast<const ArrOfDouble&>(tab_vit).view_ro();
+              DoubleArrView vpoint = static_cast<ArrOfDouble&>(tab_vpoint).view_wo();
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
+                int n0 = face_voisins(num_face, 0);
+                if (n0 == -1) n0 = face_voisins(num_face, 1);
+                // GF en cas de diffusion implicite vpoint!=0 on ignore l'ancienne valeur
+                vpoint(num_face)=(val_imp(num_face-ndeb,orientation_VDF(num_face))*rho_face_np1(num_face)-
+                                  vit(num_face)*rho_face_n(num_face))/dt_;
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
           else // VEF //
             {
               int dim = Objet_U::dimension;
-              CDoubleTabView val_imp = diri.tab_val_imp().view_ro();
-              CDoubleArrView rho_face_np1 = static_cast<const ArrOfDouble&>(tab_rho_face_np1).view_ro();
-              CDoubleArrView rho_face_n = static_cast<const ArrOfDouble&>(tab_rho_face_n).view_ro();
               CDoubleTabView vit = tab_vit.view_ro();
               DoubleTabView vpoint = tab_vpoint.view_wo();
-              Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy({ndeb, 0}, {nfin, dim});
-              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), policy, KOKKOS_LAMBDA(const int num_face, const int jj)
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({ndeb, 0}, {nfin, dim}), KOKKOS_LAMBDA(const int num_face, const int jj)
               {
                 // GF en cas de diffusion implicite vpoint!=0 on ignrore l'ancienne valeur
                 vpoint(num_face,jj)=(rho_face_np1(num_face)*val_imp(num_face-ndeb,jj)
diff --git a/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.h b/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.h
index 779295a780..d3a486898b 100644
--- a/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.h
+++ b/src/ThHyd/Dilatable/Common/Equations/Navier_Stokes_Fluide_Dilatable_Proto.h
@@ -58,7 +58,7 @@ public :
                                 DoubleTab& secmem,DoubleTab& inc_pre,DoubleTab& vpoint );
   void correct_and_compute_u_np1(Navier_Stokes_std& eqn,const Fluide_Dilatable_base& fluide_dil,DoubleTab& rhoU,
                                  DoubleTab& Mmoins1grad,DoubleTab& inc_pre,DoubleTab& gradP,DoubleTab& vpoint);
-  public_for_cuda
+  protected_but_public_for_cuda
   void prepare_and_solve_u_star(Navier_Stokes_std& eqn,const Fluide_Dilatable_base& fluide_dil,DoubleTab& rhoU,DoubleTab& vpoint);
   void update_vpoint_on_boundaries(const Navier_Stokes_std& eqn,const Fluide_Dilatable_base& fluide_dil,DoubleTab& vpoint);
 };
diff --git a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_GP_base.cpp b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_GP_base.cpp
index 98cfdd280a..be8392f23c 100644
--- a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_GP_base.cpp
+++ b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_GP_base.cpp
@@ -153,21 +153,23 @@ void Loi_Etat_GP_base::calculer_alpha()
   if (champ_alpha.que_suis_je()=="Champ_Fonc_P0_VDF") isVDF = 1;
   int n=tab_alpha.size();
   bool lambda_uniforme = sub_type(Champ_Uniforme,champ_lambda);
+  CDoubleTabView lambda = tab_lambda.view_ro();
+  CDoubleTabView rho = tab_rho.view_ro();
+  DoubleTabView alpha = tab_alpha.view_wo();
+  double Cp = Cp_; // Local copy: the Cp_ class attribute cannot be accessed from the device kernel
   if (isVDF)
     {
-      ToDo_Kokkos("critical");
-      for (int i=0 ; i<n ; i++)
-        tab_alpha(i,0) = (lambda_uniforme ? tab_lambda(0,0) : tab_lambda(i,0)) / (tab_rho(i,0) * Cp_);
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA(const int i)
+      {
+        alpha(i,0) = (lambda_uniforme ? lambda(0,0) : lambda(i,0)) / (rho(i,0) * Cp);
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
   else
     {
       const IntTab& tab_elem_faces = ref_cast(Domaine_VF,le_fluide->vitesse().domaine_dis_base()).elem_faces();
       int nfe = tab_elem_faces.line_size();
-      double Cp = Cp_;
       CIntTabView elem_faces = tab_elem_faces.view_ro();
-      CDoubleTabView lambda = tab_lambda.view_ro();
-      CDoubleTabView rho = tab_rho.view_ro();
-      DoubleTabView alpha = tab_alpha.view_wo();
       Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA(
                              const int i)
       {
diff --git a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_Mono_GP_base.h b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_Mono_GP_base.h
index 0ee866c9f2..85351e1a76 100644
--- a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_Mono_GP_base.h
+++ b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_Mono_GP_base.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -30,6 +30,8 @@
 class Loi_Etat_Mono_GP_base : public Loi_Etat_GP_base
 {
   Declare_base(Loi_Etat_Mono_GP_base);
+public:
+  KOKKOS_INLINE_FUNCTION static double calculer_rho(double P, double T, double r) { return P / (r * T); }
 protected :
   OWN_PTR(Champ_base) rho_constant_pour_debug_;
 };
diff --git a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_base.cpp b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_base.cpp
index 865e0383b2..0cf83b3adf 100644
--- a/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_base.cpp
+++ b/src/ThHyd/Dilatable/Common/Milieu/Loi_Etat_base.cpp
@@ -172,8 +172,14 @@ void Loi_Etat_base::calculer_nu()
   if (viscosite_cinematique.que_suis_je()=="Champ_Fonc_P0_VDF")
     {
       // VDF
-      for (int i=0 ; i<n ; i++)
-        tab_nu(i,0) = tab_mu(uniforme ? 0 : i,0) / tab_rho(i,0);
+      CDoubleTabView rho = tab_rho.view_ro();
+      CDoubleTabView mu = tab_mu.view_ro();
+      DoubleTabView nu = tab_nu.view_rw();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA(const int i)
+      {
+        nu(i,0) = mu(uniforme ? 0 : i,0) / rho(i,0);
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
   else // VEF
     {
diff --git a/src/ThHyd/Dilatable/Common/Sources/Source_Masse_Fluide_Dilatable_base.cpp b/src/ThHyd/Dilatable/Common/Sources/Source_Masse_Fluide_Dilatable_base.cpp
index dab3e032fa..4c61d73f04 100644
--- a/src/ThHyd/Dilatable/Common/Sources/Source_Masse_Fluide_Dilatable_base.cpp
+++ b/src/ThHyd/Dilatable/Common/Sources/Source_Masse_Fluide_Dilatable_base.cpp
@@ -21,6 +21,8 @@
 #include <Domaine_VF.h>
 #include <SFichier.h>
 #include <Param.h>
+#include <kokkos++.h>
+#include <TRUSTArray_kokkos.tpp>
 
 Implemente_base(Source_Masse_Fluide_Dilatable_base, "Source_Masse_Fluide_Dilatable_base", Objet_U);
 // XD mass_source interprete nul 1 Mass source used in a dilatable simulation to add/reduce a mass at the boundary (volumetric source in the first cell of a given boundary).
@@ -114,7 +116,6 @@ void Source_Masse_Fluide_Dilatable_base::mettre_a_jour(double temps)
 
   double sum_conv = 0.;
   std::vector<double> sum_diff_vect(ncomp_);
-  const DoubleTab& val_flux0 = ch_front_source_->valeurs();
 
   const Domaine_Cl_dis_base& zclb = domaine_cl_dis_.valeur();
   const Domaine_VF& zvf = ref_cast(Domaine_VF, zclb.domaine_dis());
@@ -130,14 +131,17 @@ void Source_Masse_Fluide_Dilatable_base::mettre_a_jour(double temps)
           const int is_uniforme = sub_type(Champ_front_uniforme, ch_front_source_.valeur());
           const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
 
+          CDoubleTabView val_flux0 = ch_front_source_->valeurs().view_ro();
+          CDoubleArrView face_surfaces = zvf.face_surfaces().view_ro();
           for (int i = 0; i < ncomp_; i++)
             {
               double sum_diff = 0.;
-              for (int f = ndeb; f < nfin; f++)
-                {
-                  const double surf = zvf.face_surfaces(f);
-                  sum_diff += is_uniforme ? val_flux0(0, i) * surf : val_flux0(f - ndeb, i) * surf;
-                }
+              Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int f, double& local_sum)
+              {
+                const double surf = face_surfaces(f);
+                local_sum += is_uniforme ? val_flux0(0, i) * surf : val_flux0(f - ndeb, i) * surf;
+              }, sum_diff);
+              end_gpu_timer(__KERNEL_NAME__);
 
               sum_diff_vect[i] = Process::mp_sum(sum_diff);
             }
@@ -167,9 +171,8 @@ void Source_Masse_Fluide_Dilatable_base::set_temps_defaut(double temps)
   ch_front_source_->set_temps_defaut(temps);
 }
 
-void Source_Masse_Fluide_Dilatable_base::fill_val_flux_tab(DoubleTrav& val_flux) const
+void Source_Masse_Fluide_Dilatable_base::fill_val_flux_tab(DoubleTrav& tab_val_flux) const
 {
-  const DoubleTab& val_flux0 = ch_front_source_->valeurs();
   /*
    * XXX Elie Saikali mai 2025 : soucis avec ICoCo ...
    * Attention : val_flux a dimension de nb_faces or val_flux0 a dimension de nb_faces du bord nom_bord_ ...
@@ -186,10 +189,15 @@ void Source_Masse_Fluide_Dilatable_base::fill_val_flux_tab(DoubleTrav& val_flux)
           // Handle uniform case ... such a pain:
           const int is_uniforme = sub_type(Champ_front_uniforme, ch_front_source_.valeur());
           const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
-
-          for (int num_face = ndeb; num_face < nfin; num_face++)
-            for (int ncomp = 0; ncomp < val_flux0.line_size(); ncomp++)
+          const int ncomp_size = ch_front_source_->valeurs().line_size();
+          CDoubleTabView val_flux0 = ch_front_source_->valeurs().view_ro();
+          DoubleTabView val_flux = tab_val_flux.view_rw();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+          {
+            for (int ncomp = 0; ncomp < ncomp_size; ncomp++)
               val_flux(num_face, 0) += is_uniforme ? val_flux0(0, ncomp) : val_flux0(num_face - ndeb, ncomp);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
 }
diff --git a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Fluide_Quasi_Compressible.h b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Fluide_Quasi_Compressible.h
index 2ad449d486..5491df5a15 100644
--- a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Fluide_Quasi_Compressible.h
+++ b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Fluide_Quasi_Compressible.h
@@ -45,7 +45,7 @@ protected :
   mutable DoubleTab tab_W_old_;
 
 private :
-  public_for_cuda
+  protected_but_public_for_cuda
   void remplir_champ_pression_tot(int n, const DoubleTab& PHydro, DoubleTab& PTot) override;
 };
 
diff --git a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.cpp b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.cpp
index 7a8b9c06e7..39e78813bc 100644
--- a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.cpp
+++ b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.cpp
@@ -54,7 +54,7 @@ void Loi_Etat_GP_QC::compute_tab_rho(DoubleTab& tab_rho)
   DoubleArrView rho = static_cast<ArrOfDouble&>(tab_rho).view_wo();
   Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), tab_rho.size(), KOKKOS_LAMBDA(const int som)
   {
-    rho_np1(som) = rho_constant ? rho_constant : Loi_Etat_Mono_GP_base::calculer_masse_volumique(Pth, tab_ICh(som), R);
+    rho_np1(som) = rho_constant ? rho_constant : calculer_rho(Pth, tab_ICh(som), R);
     rho(som) = 0.5 * (rho_n(som) + rho_np1(som));
   });
   end_gpu_timer(__KERNEL_NAME__);
diff --git a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.h b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.h
index ea1106bf34..2b2baaadc9 100644
--- a/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.h
+++ b/src/ThHyd/Dilatable/Quasi_Compressible/Milieu/Loi_Etat_GP_QC.h
@@ -35,7 +35,7 @@ public :
   void calculer_masse_volumique() override;
   double calculer_masse_volumique(double,double) const override;
 protected:
-  public_for_cuda
+  protected_but_public_for_cuda
   void compute_tab_rho(DoubleTab&) override;
 };
 
diff --git a/src/ThHyd/Dilatable/Quasi_Compressible/Sources/Source_QC_Chaleur.h b/src/ThHyd/Dilatable/Quasi_Compressible/Sources/Source_QC_Chaleur.h
index 744f722365..724b540b69 100644
--- a/src/ThHyd/Dilatable/Quasi_Compressible/Sources/Source_QC_Chaleur.h
+++ b/src/ThHyd/Dilatable/Quasi_Compressible/Sources/Source_QC_Chaleur.h
@@ -37,7 +37,7 @@ class Source_QC_Chaleur : public Source_Chaleur_Fluide_Dilatable_base
 public:
   DoubleTab& ajouter(DoubleTab& ) const override;
 protected:
-  public_for_cuda
+  protected_but_public_for_cuda
   virtual DoubleTab& ajouter_(DoubleTab& ) const;
 };
 
diff --git a/src/ThHyd/Incompressible/Cond_Lim/Entree_fluide_T_h_imposee.h b/src/ThHyd/Incompressible/Cond_Lim/Entree_fluide_T_h_imposee.h
index 907c7971e4..af607bd9cc 100644
--- a/src/ThHyd/Incompressible/Cond_Lim/Entree_fluide_T_h_imposee.h
+++ b/src/ThHyd/Incompressible/Cond_Lim/Entree_fluide_T_h_imposee.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -29,6 +29,7 @@ class Entree_fluide_T_h_imposee: public Dirichlet_entree_fluide
 {
   Declare_instanciable(Entree_fluide_T_h_imposee);
 public:
+  using Dirichlet::val_imp;
   double val_imp(int i) const override;
   double val_imp(int i, int j) const override;
   inline void bascule_cond_lim_en_enthalpie() { type_cond_lim = 1; }
diff --git a/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.cpp b/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.cpp
index dd2c116bbd..a73d21f09d 100644
--- a/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.cpp
+++ b/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2026, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -160,7 +160,7 @@ double Neumann_sortie_libre::val_ext(int i, int j) const
     return le_champ_ext->valeurs()(i, j);
 }
 
-const DoubleTab& Neumann_sortie_libre::val_ext() const
+const DoubleTab& Neumann_sortie_libre::tab_val_ext() const
 {
   const Front_VF& le_bord = ref_cast(Front_VF, frontiere_dis());
   int nb_faces_tot = le_bord.nb_faces_tot();
diff --git a/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.h b/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.h
index 5fe1c8b82a..56a3e7c0b2 100644
--- a/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.h
+++ b/src/ThHyd/Incompressible/Cond_Lim/Neumann_sortie_libre.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -39,7 +39,7 @@ class Neumann_sortie_libre: public Neumann_val_ext
 
   double val_ext(int i) const override;
   double val_ext(int i, int j) const override;
-  const DoubleTab& val_ext() const;
+  const DoubleTab& tab_val_ext() const override;
   int initialiser(double temps) override;
   void associer_fr_dis_base(const Frontiere_dis_base&) override;
   void verifie_ch_init_nb_comp() const override;
diff --git a/src/ThHyd/Incompressible/Equations/Navier_Stokes_std.cpp b/src/ThHyd/Incompressible/Equations/Navier_Stokes_std.cpp
index 32437ef8c3..0881834899 100644
--- a/src/ThHyd/Incompressible/Equations/Navier_Stokes_std.cpp
+++ b/src/ThHyd/Incompressible/Equations/Navier_Stokes_std.cpp
@@ -1067,7 +1067,6 @@ void Navier_Stokes_std::mettre_a_jour(double temps)
   Debog::verifier("Navier_Stokes_std::mettre_a_jour : pression", la_pression->valeurs());
   Debog::verifier("Navier_Stokes_std::mettre_a_jour : vitesse", la_vitesse->valeurs());
 
-  if (la_vorticite) la_vorticite->mettre_a_jour(temps);
   if (critere_Q) critere_Q->mettre_a_jour(temps);
   if (Reynolds_maille) Reynolds_maille->mettre_a_jour(temps);
   if (Taux_cisaillement) Taux_cisaillement->mettre_a_jour(temps);
@@ -1505,7 +1504,7 @@ const Champ_base& Navier_Stokes_std::get_champ(const Motcle& nom) const
         throw std::runtime_error(std::string("Field ") + nom.getString() + std::string(" not found !"));
 
       Champ_Fonc_base& ch = ref_cast_non_const(Champ_Fonc_base, la_vorticite.valeur());
-      if ((ch.temps() == temps_init) && (la_vitesse->mon_equation_non_nul()))
+      if (((ch.temps() != la_vitesse->temps()) || (ch.temps() == temps_init)) && (la_vitesse->mon_equation_non_nul()))
         ch.mettre_a_jour(la_vitesse->temps());
       return champs_compris_.get_champ(nom);
     }
diff --git a/src/ThHyd/Incompressible/Problems/Generique/Pb_Fluide_base.cpp b/src/ThHyd/Incompressible/Problems/Generique/Pb_Fluide_base.cpp
index d9bdfdbfa6..734e174778 100644
--- a/src/ThHyd/Incompressible/Problems/Generique/Pb_Fluide_base.cpp
+++ b/src/ThHyd/Incompressible/Problems/Generique/Pb_Fluide_base.cpp
@@ -103,6 +103,13 @@ int Pb_Fluide_base::expression_predefini(const Motcle& motlu, Nom& expression)
       expression += " energie_cinetique_elem } ";
       return 1;
     }
+  else if (motlu=="ENSTROPHIE_TOTALE")
+    {
+      expression = "predefini { pb_champ ";
+      expression += le_nom();
+      expression += " enstrophie_totale } ";
+      return 1;
+    }
   else if (motlu=="VISCOUS_FORCE_X")
     {
       expression = "predefini { pb_champ ";
diff --git a/src/ThHyd/Incompressible/Sources/Terme_Boussinesq_base.h b/src/ThHyd/Incompressible/Sources/Terme_Boussinesq_base.h
index 6f1c25bc89..e8f31967b6 100644
--- a/src/ThHyd/Incompressible/Sources/Terme_Boussinesq_base.h
+++ b/src/ThHyd/Incompressible/Sources/Terme_Boussinesq_base.h
@@ -125,6 +125,12 @@ inline double valeur(const DoubleTab& valeurs, const int elem, const int dim)
     return valeurs(elem,dim);
 }
 
+KOKKOS_INLINE_FUNCTION // Device-callable counterpart of valeur(const DoubleTab&, elem, dim) reading a 2D view; returns the value of component dim at elem
+double valeur(CDoubleTabView valeurs, const int elem, const int dim)
+{
+  return valeurs.extent(1)==1 ? valeurs(elem,0) : valeurs(elem,dim); // single-column field: always read column 0 (rank() is fixed by the view type, so the column count is tested instead)
+}
+
 // Methode de calcul de la valeur sur une face encadree par elem1 et elem2 d'un champ uniforme ou non a plusieurs composantes
 inline double valeur(const DoubleTab& valeurs_champ, int elem1, int elem2, const int compo)
 {
@@ -139,6 +145,22 @@ inline double valeur(const DoubleTab& valeurs_champ, int elem1, int elem2, const
         return 0.5*(valeurs_champ(elem1,compo)+valeurs_champ(elem2,compo));
     }
 }
+
+KOKKOS_INLINE_FUNCTION // Device-callable face value of a field: average of the values at elem1 and elem2 for component compo
+double valeur(CDoubleTabView valeurs_champ, int elem1, int elem2, const int compo)
+{
+  if (valeurs_champ.extent(0)==1)
+    return valeurs_champ(0,compo); // Uniform field: a single row holds the value
+  else
+    {
+      if (elem2<0) elem2 = elem1; // boundary face: no second neighbour, reuse elem1
+      if (valeurs_champ.extent(1)==1) // single-column field (rank() is fixed by the view type and can never be 1 here)
+        return 0.5*(valeurs_champ(elem1,0)+valeurs_champ(elem2,0));
+      else
+        return 0.5*(valeurs_champ(elem1,compo)+valeurs_champ(elem2,compo));
+    }
+}
+
 KOKKOS_INLINE_FUNCTION
 double valeur(CDoubleTabView valeurs_champ, int valeurs_champ_dimension0, int nb_dim, int elem1, int elem2, const int compo, int nb_compo)
 {
diff --git a/src/ThHyd/Incompressible/Traitement_particulier/Traitement_particulier_NS_EC.cpp b/src/ThHyd/Incompressible/Traitement_particulier/Traitement_particulier_NS_EC.cpp
index 9265040d88..eea80c2f80 100644
--- a/src/ThHyd/Incompressible/Traitement_particulier/Traitement_particulier_NS_EC.cpp
+++ b/src/ThHyd/Incompressible/Traitement_particulier/Traitement_particulier_NS_EC.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -172,73 +172,80 @@ void Traitement_particulier_NS_EC::post_traitement_particulier()
 static double trait_part_calculer_ec_faces(const int         face_debut,
                                            const int         nb_faces,
                                            const int                frontiere,
-                                           const DoubleTab&         vitesse,
-                                           const DoubleVect&         volumes_entrelaces,
-                                           const DoubleTab&         xv,
-                                           const DoubleTab&         masse_volumique,
-                                           const ArrOfDouble&         translation,
-                                           const ArrOfDouble&         rotation,
+                                           const DoubleTab&         tab_vitesse, // velocity values, indexed by face
+                                           const DoubleVect&         tab_volumes_entrelaces, // staggered ("entrelaces") volume of each face
+                                           const DoubleTab&         tab_xv, // per-face coordinates used in the rotation term (presumably face centres - TODO confirm)
+                                           const DoubleTab&         tab_masse_volumique, // density; a single row is treated as a uniform value
+                                           const ArrOfDouble&         tab_translation, // translation part of the frame velocity
+                                           const ArrOfDouble&         tab_rotation, // rotation vector of the frame (only read when dimension is 3)
                                            const int         repere_mobile_,
-                                           const ArrOfInt&        faces_doubles
+                                           const ArrOfInt&        tab_faces_doubles // a value of 1 halves the contribution of the face
                                           )
 {
   const int face_fin = face_debut + nb_faces;
   double ec = 0.;
-  double rho = 0.;
-  const int nb_dim_1 = (vitesse.line_size() == 1);
+  const int nb_dim_1 = (tab_vitesse.line_size() == 1); // non-zero when there is one velocity component per face (VDF case)
   const int dim      = Objet_U::dimension;
-  ArrOfDouble ve(Objet_U::dimension);
-  for (int face = face_debut; face < face_fin; face++)
+  if (nb_dim_1 && repere_mobile_) // unsupported combination: stop on the host before launching the kernel
     {
-      // Calcul de la vitesse d'entrainement
-      if (repere_mobile_)
-        {
-          ve[0]=translation[0];
-          ve[1]=translation[1];
-          if (Objet_U::dimension==3)
-            {
-              ve[2]=translation[2];
-              ve[0]+=rotation[1]*xv(face,2)-rotation[2]*xv(face,1);
-              ve[1]+=rotation[2]*xv(face,0)-rotation[0]*xv(face,2);
-              ve[2]+=rotation[0]*xv(face,1)-rotation[1]*xv(face,0);
-            }
-        }
-      else
-        ve=0;
-
-      double v2;
-      double volume;
-      if (nb_dim_1)
-        {
-          // Une composante de vitesse a la face (VDF)
-          const double v = vitesse(face);
-          if (repere_mobile_)
-            {
-              Cerr << "Le codage de l'energie cinetique calculee dans un repere fixe" <<finl;
-              Cerr << "n'est pas fait en VDF." << finl;
-              Process::exit(); // En effet probleme de conception, il faudrait avoir l'orientation des faces VDF
-            }
-          v2 = v * v;
-          // En VDF, sur les frontieres, on ne prend que le 1/2 volume entrelace
-          volume = (frontiere ? 0.5 : 1) * volumes_entrelaces(face);
-        }
-      else
-        {
-          // Deux ou trois composantes (VEFP1B)
-          v2 = 0.;
-          for (int i = 0; i < dim; i++)
-            {
-              const double v_i = vitesse(face, i);
-              v2 += (v_i + ve[i]) * (v_i + ve[i]);
-            }
-          // En VEF, cela est incorrect, il faudrait les volumes etendus:
-          volume = volumes_entrelaces(face);
-        }
-      const int k = (masse_volumique.dimension(0)==1) ? 0 : face;
-      rho = masse_volumique(k, 0);
-      double contribution = (faces_doubles[face]==1) ? 0.5 : 1 ;
-      ec += contribution * 0.5 * v2 * volume * rho;
+      Cerr << "Le codage de l'energie cinetique calculee dans un repere fixe" <<finl;
+      Cerr << "n'est pas fait en VDF." << finl;
+      Process::exit(); // Design limitation: the orientation of the VDF faces would be needed
     }
+  const int masse_vol_uniform = (tab_masse_volumique.dimension(0) == 1); // one row only: the same density is used for every face
+  const double volume_factor = (frontiere ? 0.5 : 1.); // applied in the one-component (VDF) branch only
+  CDoubleTabView vitesse = tab_vitesse.view_ro();
+  CDoubleArrView volumes_entrelaces = tab_volumes_entrelaces.view_ro();
+  CDoubleTabView masse_volumique = tab_masse_volumique.view_ro();
+  CIntArrView faces_doubles = tab_faces_doubles.view_ro();
+  CDoubleTabView xv = tab_xv.view_ro();
+  CDoubleArrView translation = tab_translation.view_ro();
+  CDoubleArrView rotation = tab_rotation.view_ro();
+  Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(face_debut, face_fin), // sums the per-face contributions over [face_debut, face_fin) into ec
+                          KOKKOS_LAMBDA(const int face, double& ec_)
+  {
+    // Frame velocity at the face: translation, plus rotation x position in 3D (zero when repere_mobile_ is not set)
+    double ve0 = 0., ve1 = 0., ve2 = 0.;
+    if (repere_mobile_)
+      {
+        ve0 = translation(0);
+        ve1 = translation(1);
+        if (dim == 3)
+          {
+            ve2 = translation(2);
+            ve0 += rotation(1)*xv(face,2) - rotation(2)*xv(face,1);
+            ve1 += rotation(2)*xv(face,0) - rotation(0)*xv(face,2);
+            ve2 += rotation(0)*xv(face,1) - rotation(1)*xv(face,0);
+          }
+      }
+    double v2, volume;
+    if (nb_dim_1)
+      {
+        // One velocity component per face (VDF)
+        const double v = vitesse(face, 0);
+        v2 = v * v;
+        // In VDF, only half of the staggered volume is taken on boundaries
+        volume = volume_factor * volumes_entrelaces(face);
+      }
+    else
+      {
+        // Two or three components (VEFP1B)
+        v2 = 0.;
+        const double ve[3] = {ve0, ve1, ve2};
+        for (int i = 0; i < dim; i++)
+          {
+            const double v_i = vitesse(face, i);
+            v2 += (v_i + ve[i]) * (v_i + ve[i]);
+          }
+        // In VEF this is not correct: the extended volumes would be needed:
+        volume = volumes_entrelaces(face);
+      }
+    const int k = masse_vol_uniform ? 0 : face;
+    const double rho = masse_volumique(k, 0);
+    const double contribution = (faces_doubles(face) == 1) ? 0.5 : 1.;
+    ec_ += contribution * 0.5 * v2 * volume * rho; // 1/2 * rho * |v|^2 * volume, weighted by the face contribution factor
+  }, Kokkos::Sum<double>(ec));
+  end_gpu_timer(__KERNEL_NAME__);
   return ec;
 }
 
diff --git a/src/ThHyd/Multiphase/Schemas_Temps/SETS.cpp b/src/ThHyd/Multiphase/Schemas_Temps/SETS.cpp
index d5d8766737..e92650fa18 100644
--- a/src/ThHyd/Multiphase/Schemas_Temps/SETS.cpp
+++ b/src/ThHyd/Multiphase/Schemas_Temps/SETS.cpp
@@ -212,7 +212,7 @@ void SETS::init_cv_ctx(const DoubleTab& secmem, const DoubleVect& norme)
   KSPConvergedDefaultCreate(&cv_ctx->defctx);
 }
 
-#if PETSC_VERSION_GE(3,24,0)
+#if PETSC_VERSION_GE(3,24,0) && PETSC_VERSION_LT(3,25,0)
 PetscErrorCode SETS::destroy_cvctx(void **mctx)
 {
   SETS::cv_test_t *ctx = (SETS::cv_test_t *)*mctx;
@@ -224,6 +224,18 @@ PetscErrorCode SETS::destroy_cvctx(void **mctx)
   free(ctx);
   return err;
 }
+#elif PETSC_VERSION_GE(3,25,0)
+PetscErrorCode SETS::destroy_cvctx(void *mctx) // PETSc >= 3.25 variant: releases the cv_test_t context and returns a PETSc error code
+{
+  SETS::cv_test_t *ctx = *(SETS::cv_test_t **)mctx; // mctx is read as the address of the context pointer, not as the context itself
+  if (ctx->v)
+    VecDestroy(&ctx->v); // vectors are destroyed only when they are set
+  if (ctx->t)
+    VecDestroy(&ctx->t);
+  PetscErrorCode err = KSPConvergedDefaultDestroy((void *)&ctx->defctx); // PETSc receives the address of the default-convergence context
+  free(ctx); // NOTE(review): assumes ctx was obtained from malloc/calloc - confirm against init_cv_ctx
+  return err; // only the status of KSPConvergedDefaultDestroy is reported; VecDestroy results are not checked
+}
 #else
 PetscErrorCode SETS::destroy_cvctx(void *mctx)
 {
diff --git a/src/ThHyd/Multiphase/Schemas_Temps/SETS.h b/src/ThHyd/Multiphase/Schemas_Temps/SETS.h
index 3887bba782..d70326b596 100644
--- a/src/ThHyd/Multiphase/Schemas_Temps/SETS.h
+++ b/src/ThHyd/Multiphase/Schemas_Temps/SETS.h
@@ -94,7 +94,7 @@ class SETS: public Simpler
   ArrOfTID ix; //indices pour recuperer le residu
   cv_test_t *cv_ctx = nullptr;
   void init_cv_ctx(const DoubleTab& secmem, const DoubleVect& norm);
-#if PETSC_VERSION_GE(3,24,0)
+#if PETSC_VERSION_GE(3,24,0) && PETSC_VERSION_LT(3,25,0)
   static PetscErrorCode destroy_cvctx(void **mctx);
 #else
   static PetscErrorCode destroy_cvctx(void *mctx);
diff --git a/src/ThHyd/Rayonnement/Semi_transparent/Equations/Eq_rayo_semi_transp.cpp b/src/ThHyd/Rayonnement/Semi_transparent/Equations/Eq_rayo_semi_transp.cpp
index e51857f808..d78c1115ff 100644
--- a/src/ThHyd/Rayonnement/Semi_transparent/Equations/Eq_rayo_semi_transp.cpp
+++ b/src/ThHyd/Rayonnement/Semi_transparent/Equations/Eq_rayo_semi_transp.cpp
@@ -299,18 +299,17 @@ void Eq_rayo_semi_transp::Mat_Morse_to_Mat_Bloc(Matrice& matrice_tmp)
   DoubleTab ligne_tmp(n1);
   for (int i = 0; i < n2; i++)
     {
-      int k;
       // On recopie le premier bloc de la matrice dans un tableau :
       //      ligne_tmp = 0;
-      for (k = la_matrice_.get_tab1()(i) - 1; k < la_matrice_.get_tab1()(i + 1) - 1; k++)
+      for (auto k = la_matrice_.get_tab1()(i) - 1; k < la_matrice_.get_tab1()(i + 1) - 1; k++)
         ligne_tmp(la_matrice_.get_tab2()(k) - 1) = la_matrice_.get_coeff()(k);
 
       // On complete la partie reelle de la matrice
-      for (k = tab1RR(i) - 1; k < tab1RR(i + 1) - 1; k++)
+      for (auto k = tab1RR(i) - 1; k < tab1RR(i + 1) - 1; k++)
         coeffRR[k] = ligne_tmp(tab2RR[k] - 1);
 
       // On complete la partie virtuelle
-      for (k = tab1RV(i) - 1; k < tab1RV(i + 1) - 1; k++)
+      for (auto k = tab1RV(i) - 1; k < tab1RV(i + 1) - 1; k++)
         coeffRV[k] = ligne_tmp(n2 + tab2RV[k] - 1);
     }
 }
@@ -347,13 +346,11 @@ void Eq_rayo_semi_transp::dimensionner_Mat_Bloc_Morse_Sym(Matrice& matrice_tmp)
 
   // On parcours les lignes de la_matrice pour compter les elements
   // non nuls de chaque ligne
-  int jcolonne;
   for (iligne = 0; iligne < n2; iligne++)
     {
-      int k;
-      for (k = tab1(iligne) - 1; k < tab1(iligne + 1) - 1; k++)
+      for (auto k = tab1(iligne) - 1; k < tab1(iligne + 1) - 1; k++)
         {
-          jcolonne = tab2(k) - 1;
+          auto jcolonne = tab2(k) - 1;
           if (jcolonne < n2)
             {
               // l'element correspondant est dans la partie RR de la_matrice
@@ -384,15 +381,13 @@ void Eq_rayo_semi_transp::dimensionner_Mat_Bloc_Morse_Sym(Matrice& matrice_tmp)
   MBrv.dimensionner(n2, n1 - n2, tab1RV(n2) - 1);
 
   // On remplit tab2RR et tab2RV
-  int compteurRR, compteurRV;
   for (iligne = 0; iligne < n2; iligne++)
     {
-      int k;
-      compteurRR = tab1RR(iligne) - 1;
-      compteurRV = tab1RV(iligne) - 1;
-      for (k = tab1(iligne) - 1; k < tab1(iligne + 1) - 1; k++)
+      auto compteurRR = tab1RR(iligne) - 1;
+      auto compteurRV = tab1RV(iligne) - 1;
+      for (auto k = tab1(iligne) - 1; k < tab1(iligne + 1) - 1; k++)
         {
-          jcolonne = tab2(k) - 1;
+          auto jcolonne = tab2(k) - 1;
           if (jcolonne < n2)
             {
               // l'element correspondant est dans la partie RR de la_matrice
diff --git a/src/ThHyd/Schemas_Temps/Simple.cpp b/src/ThHyd/Schemas_Temps/Simple.cpp
index abb64bc2be..3bae04dc04 100644
--- a/src/ThHyd/Schemas_Temps/Simple.cpp
+++ b/src/ThHyd/Schemas_Temps/Simple.cpp
@@ -450,7 +450,16 @@ bool Simple::iterer_eqs(LIST(OBS_PTR(Equation_base)) eqs, int nb_iter, int& ok)
   DoubleTab_parts residu_parts(residus), inconnues_parts(inconnues), dudt_parts(dudt);
 
   //remplissage des inconnues
-  for(i = 0; i < eqs.size(); i++) inconnues_parts[i] = eqs[i]->inconnue().valeurs();
+  // DoubleTab_parts share data_location_: allocating device memory for parts[0] marks all parts as
+  // Device, but parts[1..] have no exact entry in DeviceMemory, causing inconsistency. Ensure all
+  // unknowns are on host so no device allocation is triggered in copy_.
+  for(i = 0; i < eqs.size(); i++)
+    {
+      ToDo_Kokkos("Fix this D2H copy.");
+      eqs[i]->inconnue().valeurs().ensureDataOnHost();
+    }
+  for(i = 0; i < eqs.size(); i++)
+    inconnues_parts[i] = eqs[i]->inconnue().valeurs();
   dudt = inconnues;
 
   //remplissage des matrices
diff --git a/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_base.cpp b/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_base.cpp
index 3bde1b6935..3c59a2ac5a 100644
--- a/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_base.cpp
+++ b/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_base.cpp
@@ -124,24 +124,29 @@ const Champ_base& Turbulence_paroi_base::get_champ(const Motcle& nom) const
   if (champ_u_star_ && nom == champ_u_star_->le_nom())
     {
       // Initialisation a 0 du champ volumique u_star
-      DoubleTab& valeurs = champ_u_star_->valeurs();
-      valeurs = 0;
+      DoubleTab& tab_valeurs = champ_u_star_->valeurs();
+      tab_valeurs = 0;
       const Equation_base& my_eqn = mon_modele_turb_hyd->equation();
       if (tab_u_star_.size_array() > 0)
         {
           // Boucle sur les frontieres pour recuperer u_star si tab_u_star dimensionne
           int nb_front = my_eqn.domaine_dis().nb_front_Cl();
+          CDoubleArrView u_star = tab_u_star_.view_ro();
+          DoubleArrView valeurs = static_cast<ArrOfDouble&>(tab_valeurs).view_rw();
           for (int n_bord = 0; n_bord < nb_front; n_bord++)
             {
               const Cond_lim& la_cl = my_eqn.domaine_Cl_dis().les_conditions_limites(n_bord);
               const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-              int ndeb = le_bord.num_premiere_face();
-              int nfin = ndeb + le_bord.nb_faces();
-              for (int num_face = ndeb; num_face < nfin; num_face++)
-                valeurs(num_face) = tab_u_star_(num_face);
+              const int ndeb = le_bord.num_premiere_face();
+              const int nfin = ndeb + le_bord.nb_faces();
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
+                valeurs(num_face) = u_star(num_face);
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
         }
-      valeurs.echange_espace_virtuel();
+      tab_valeurs.echange_espace_virtuel();
       // Met a jour le temps du champ:
       champ_u_star_->mettre_a_jour(my_eqn.schema_temps().temps_courant());
       return champs_compris_.get_champ(nom);
diff --git a/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_scal_base.h b/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_scal_base.h
index 217915444d..d9d9bd1490 100644
--- a/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_scal_base.h
+++ b/src/ThHyd/Turbulence/Lois_Paroi/Turbulence_paroi_scal_base.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -106,7 +106,7 @@ class Turbulence_paroi_scal_base: public Champs_compris_interface, public Objet_
   int calcul_ldp_en_flux_impose_; // flag defenissant si on utilise la ldp en flux impose 0 par defaut
   double Prdt_sur_kappa_;         // Constante dans la loi de paroi
   KOKKOS_INLINE_FUNCTION
-  double T_plus(double y_plus, double Pr, double Prdt_sur_kappa);
+  static double T_plus(double y_plus, double Pr, double Prdt_sur_kappa);
 
   DoubleVects equivalent_distance_;
   // tableau des distances equivalentes sur chaque bord
diff --git a/src/ThHyd/Turbulence/Modeles/scal/Modele_turbulence_scal_Prandtl.h b/src/ThHyd/Turbulence/Modeles/scal/Modele_turbulence_scal_Prandtl.h
index 3f3086bfaf..668d2de307 100644
--- a/src/ThHyd/Turbulence/Modeles/scal/Modele_turbulence_scal_Prandtl.h
+++ b/src/ThHyd/Turbulence/Modeles/scal/Modele_turbulence_scal_Prandtl.h
@@ -39,7 +39,7 @@ class Modele_turbulence_scal_Prandtl: public Modele_turbulence_scal_diffturb_bas
   Nom definition_fonction_; // stockage de la chaine du jdd
   Parser_U fonction_; // fonction de calcul de alpha_t
   Parser_U fonction1_; // fonction de calcul de Prandtl variant en espace
-  public_for_cuda
+  protected_but_public_for_cuda
   virtual Champ_Fonc_base& calculer_diffusivite_turbulente();
 };
 
diff --git a/src/VDF/Champs/Champ_Face_VDF.cpp b/src/VDF/Champs/Champ_Face_VDF.cpp
index f1c6d26b6f..b9f802d8ae 100644
--- a/src/VDF/Champs/Champ_Face_VDF.cpp
+++ b/src/VDF/Champs/Champ_Face_VDF.cpp
@@ -72,6 +72,7 @@ Champ_base& Champ_Face_VDF::affecter_(const Champ_base& ch)
       int ndeb_int = domaine_VDF.premiere_face_int();
       const IntTab& f_e = domaine_VDF.face_voisins();
 
+      ToDo_Kokkos("critical");
       for (int f = 0; f < ndeb_int; f++)
         {
           const int ori = orientation(f);
@@ -80,6 +81,7 @@ Champ_base& Champ_Face_VDF::affecter_(const Champ_base& ch)
             val(f, n) = v(e, N * ori + n);
         }
 
+      ToDo_Kokkos("critical");
       for (int f = ndeb_int; f < domaine_VDF.nb_faces(); f++)
         {
           const int ori = orientation(f);
@@ -94,6 +96,7 @@ Champ_base& Champ_Face_VDF::affecter_(const Champ_base& ch)
       if (unif) eval = ch.valeurs();
       else eval.resize(val.dimension(0), N * D), ch.valeur_aux(domaine_VDF.xv(), eval);
 
+      ToDo_Kokkos("critical");
       for (int f = 0; f < domaine_VDF.nb_faces(); f++)
         for (int n = 0; n < N; n++)
           val(f, n) = eval(unif ? 0 : f, N * orientation(f) + n);
@@ -154,6 +157,7 @@ const Champ_Proto& Champ_Face_VDF::affecter(const DoubleTab& v)
     {
       if (v.dimension(1) == dimension)
         {
+          ToDo_Kokkos("critical");
           if (v.dimension(0) == val.size())
             for (int num_face = 0; num_face < val.size(); num_face++)
               val(num_face) = v(num_face, orientation(num_face));
@@ -179,8 +183,7 @@ void Champ_Face_VDF::verifie_valeurs_cl()
 {
   const Domaine_Cl_dis_base& zcl = domaine_Cl_dis();
   int nb_cl = zcl.nb_cond_lim();
-  DoubleTab& ch_tab = valeurs();
-  int ndeb, nfin, num_face;
+  DoubleTab& tab_ch = valeurs();
 
   for (int i = 0; i < nb_cl; i++)
     {
@@ -189,24 +192,24 @@ void Champ_Face_VDF::verifie_valeurs_cl()
         {
           const Periodique& la_cl_perio = ref_cast(Periodique, la_cl);
           const Front_VF& le_bord = ref_cast(Front_VF, la_cl.frontiere_dis());
-          ndeb = le_bord.num_premiere_face();
-          nfin = ndeb + le_bord.nb_faces();
-          int voisine;
-          double moy;
-
-          for (num_face = ndeb; num_face < nfin; num_face++)
-            {
-              voisine = la_cl_perio.face_associee(num_face - ndeb) + ndeb;
-              if (ch_tab[num_face] != ch_tab[voisine])
-                {
-                  moy = 0.5 * (ch_tab[num_face] + ch_tab[voisine]);
-                  ch_tab[num_face] = moy;
-                  ch_tab[voisine] = moy;
-                }
-            }
+          const int ndeb = le_bord.num_premiere_face();
+          const int nfin = ndeb + le_bord.nb_faces();
+          CIntArrView face_associee = la_cl_perio.face_associee().view_ro();
+          DoubleArrView ch_tab = static_cast<ArrOfDouble&>(tab_ch).view_rw();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+          {
+            const int voisine = face_associee(num_face - ndeb) + ndeb;
+            if (ch_tab(num_face) != ch_tab(voisine))
+              {
+                const double moy = 0.5 * (ch_tab(num_face) + ch_tab(voisine));
+                ch_tab(num_face) = moy;
+                ch_tab(voisine) = moy;
+              }
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
-  ch_tab.echange_espace_virtuel();
+  tab_ch.echange_espace_virtuel();
 }
 
 /*! @brief Renvoie la valeur que devrait avoir le champ sur une face de bord, si on en croit les conditions aux limites.
@@ -220,7 +223,7 @@ void Champ_Face_VDF::verifie_valeurs_cl()
 double Champ_Face_VDF::val_imp_face_bord_private(int face, int comp) const
 {
   const Domaine_Cl_VDF& zclo = ref_cast(Domaine_Cl_VDF, equation().domaine_Cl_dis());
-  return Champ_Face_get_val_imp_face_bord_sym(valeurs(), temps(), face, comp, zclo);
+  return Champ_Face_get_val_imp_face_bord(temps(), face, comp, zclo, &valeurs()); // delegates to Champ_Face_get_val_imp_face_bord, passing this field's values by address
 }
 
 // WEC : jamais appele !!
@@ -260,14 +263,10 @@ void Champ_Face_VDF::calculer_rotationnel_ordre2_centre_element(DoubleTab& rot)
 {
   const DoubleTab& val = valeurs();
   const Domaine_VDF& domaine_VDF = domaine_vdf();
-  int nb_elem = domaine_VDF.nb_elem();
-  const IntTab& face_voisins = domaine_VDF.face_voisins();
-  const IntTab& elem_faces = domaine_VDF.elem_faces();
-
   if (dimension == 2)
-    calrotord2centelemdim2(rot, val, domaine_VDF, nb_elem, face_voisins, elem_faces);
+    calrotord2centelemdim2(rot, val, domaine_VDF);
   else if (dimension == 3)
-    calrotord2centelemdim3(rot, val, domaine_VDF, nb_elem, face_voisins, elem_faces);
+    calrotord2centelemdim3(rot, val, domaine_VDF);
 }
 
 int Champ_Face_VDF::imprime(Sortie& os, int ncomp) const
@@ -276,7 +275,7 @@ int Champ_Face_VDF::imprime(Sortie& os, int ncomp) const
   return 1;
 }
 
-void Champ_Face_VDF::calcul_critere_Q(DoubleTab& Q, const Domaine_Cl_VDF& domaine_Cl_VDF)
+void Champ_Face_VDF::calcul_critere_Q(DoubleTab& tab_Q, const Domaine_Cl_VDF& domaine_Cl_VDF)
 {
   // Q=0.5*(\Omega_{ij}*\Omega_{ij}-S_{ij}*S_{ij})=-0.25*du_i/dx_j*du_j/dx_i
 
@@ -285,55 +284,45 @@ void Champ_Face_VDF::calcul_critere_Q(DoubleTab& Q, const Domaine_Cl_VDF& domain
   const DoubleTab& vitesse = valeurs();
   const int nb_elem = domaine_VDF.nb_elem();
   const int nb_elem_tot = domaine_VDF.nb_elem_tot();
-  int num_elem, i, j, N = vitesse.line_size();
-  double crit, deriv1, deriv2;
-
+  const int N = vitesse.line_size();
+  const int dim = Objet_U::dimension;
   if (N!=1) Process::exit(que_suis_je() + "::calcul_critere_Q : the velocity field must be single phase !!");
 
-  DoubleTab gradient_elem(nb_elem_tot, dimension, dimension, N);
-  gradient_elem = 0.;
-
-  vit.calcul_duidxj(vitesse, gradient_elem, domaine_Cl_VDF);
-
-  for (num_elem = 0; num_elem < nb_elem; num_elem++)
-    {
-      crit = 0.;
-      for (i = 0; i < dimension; i++)
-        for (j = 0; j < dimension; j++)
-          {
-            deriv1 = gradient_elem(num_elem, i, j, 0);
-            deriv2 = gradient_elem(num_elem, j, i, 0);
-
-            crit += -0.25 * deriv1 * deriv2;
-          }
-      Q[num_elem] = crit;
-    }
+  DoubleTrav tab_gradient_elem(nb_elem_tot, dim, dim, N);
+  vit.calcul_duidxj(vitesse, tab_gradient_elem, domaine_Cl_VDF);
+  CDoubleTabView4 gradient_elem = tab_gradient_elem.view_ro<4>();
+  DoubleArrView Q = static_cast<ArrOfDouble&>(tab_Q).view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int num_elem)
+  {
+    double crit = 0.;
+    for (int i = 0; i < dim; i++)
+      for (int j = 0; j < dim; j++)
+        crit += -0.25 * gradient_elem(num_elem, i, j, 0) * gradient_elem(num_elem, j, i, 0);
+    Q(num_elem) = crit;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
 
-void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& domaine_Cl_VDF)
+void Champ_Face_VDF::calcul_y_plus(DoubleTab& tab_y_plus, const Domaine_Cl_VDF& domaine_Cl_VDF)
 {
   // On initialise le champ y_plus avec une valeur negative,
   // comme ca lorsqu'on veut visualiser le champ pres de la paroi,
   // on n'a qu'a supprimer les valeurs negatives et n'apparaissent
   // que les valeurs aux parois.
 
-  int ndeb, nfin, elem, ori, l_unif;
-  double norm_tau, u_etoile, norm_v = 0, dist, val0, val1, val2, d_visco = 0, visco = 1.;
-  y_plus = -1.;
+  int ndeb, nfin, l_unif;
+  double visco = 1.;
 
   const Champ_Face_VDF& vit = *this;
   const Domaine_VDF& domaine_VDF = domaine_vdf();
-  const IntTab& face_voisins = domaine_VDF.face_voisins();
-  const IntVect& orientation = domaine_VDF.orientation();
   const Equation_base& eqn_hydr = equation();
   const Fluide_base& le_fluide = ref_cast(Fluide_base, eqn_hydr.milieu());
   const Champ_Don_base& ch_visco_cin = le_fluide.viscosite_cinematique();
-  const DoubleTab& tab_visco = ch_visco_cin.valeurs();
-  //DoubleTab& tab_visco = ch_visco_cin.valeurs();
+  const DoubleTab& tab_visco_cin = ch_visco_cin.valeurs();
 
   if (sub_type(Champ_Uniforme, ch_visco_cin))
     {
-      visco = tab_visco(0, 0);
+      visco = tab_visco_cin(0, 0);
       l_unif = 1;
     }
   else
@@ -343,15 +332,15 @@ void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& doma
   /* GF on a pas a change tab_visco ici !
    if (!l_unif)
    {
-   const int n = tab_visco.size_array();
-   ArrOfDouble& v = tab_visco;
+   const int n = tab_visco_cin.size_array();
+   ArrOfDouble& v = tab_visco_cin;
    for (int i = 0; i < n; i++)
    if (v[i] < DMINFLOAT)
    v[i] = DMINFLOAT;
    }
    */
 
-  DoubleTab yplus_faces(1, 1); // will contain yplus values if available
+  DoubleTrav tab_yplus_faces(1, 1); // will contain yplus values if available
   int yplus_already_computed = 0; // flag
 
   const RefObjU& modele_turbulence = eqn_hydr.get_modele(TURBULENCE);
@@ -361,12 +350,28 @@ void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& doma
       const Turbulence_paroi_base& loipar = mod_turb.loi_paroi();
       if (loipar.use_shear())
         {
-          yplus_faces.resize(domaine_vdf().nb_faces_tot());
-          yplus_faces.ref(loipar.tab_d_plus());
+          tab_yplus_faces.resize(domaine_vdf().nb_faces_tot());
+          tab_yplus_faces.ref(loipar.tab_d_plus());
           yplus_already_computed = 1;
         }
     }
-
+  const int dim = Objet_U::dimension;
+  const int is_axi = Objet_U::axi;
+
+  Domaine_VDF_View dom_vdf(domaine_VDF);
+  CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+  CIntArrView orientation = domaine_VDF.orientation().view_ro();
+  CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro();
+  CDoubleArrView vitesse = static_cast<const ArrOfDouble&>(vit.valeurs()).view_ro();
+  CDoubleArrView yplus_faces;
+  if (yplus_already_computed) yplus_faces = static_cast<const ArrOfDouble&>(tab_yplus_faces).view_ro();
+  CDoubleTabView tab_visco;
+  if (!l_unif) tab_visco = tab_visco_cin.view_ro();
+  DoubleTrav tab_counter(tab_y_plus.size_array());
+  tab_counter = 0;
+  tab_y_plus = 0;
+  DoubleArrView y_plus = static_cast<ArrOfDouble&>(tab_y_plus).view_rw();
+  DoubleArrView counter = static_cast<ArrOfDouble&>(tab_counter).view_rw(); // rw, not wo: zeroed above, then read-modify-written by atomic_add and read back in the averaging kernel
   for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++)
     {
       const Cond_lim& la_cl = domaine_Cl_VDF.les_conditions_limites(n_bord);
@@ -376,52 +381,47 @@ void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& doma
           const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
           ndeb = le_bord.num_premiere_face();
           nfin = ndeb + le_bord.nb_faces();
-
-          for (int num_face = ndeb; num_face < nfin; num_face++)
-            {
-
-              if (face_voisins(num_face, 0) != -1)
-                elem = face_voisins(num_face, 0);
-              else
-                elem = face_voisins(num_face, 1);
-
-              if (yplus_already_computed)
-                {
-                  // y+ is only defined on faces so we take the face value to put in the element
-                  y_plus(elem) = yplus_faces(num_face);
-                }
-              else
-                {
-                  if (dimension == 2)
-                    {
-                      ori = orientation(num_face);
-                      norm_v = norm_2D_vit(vit.valeurs(), elem, ori, domaine_VDF, val0);
-                    }
-                  else if (dimension == 3)
-                    {
-                      ori = orientation(num_face);
-                      norm_v = norm_3D_vit(vit.valeurs(), elem, ori, domaine_VDF, val1, val2);
-                    } // dim 3
-
-                  if (axi)
-                    dist = domaine_VDF.dist_norm_bord_axi(num_face);
-                  else
-                    dist = domaine_VDF.dist_norm_bord(num_face);
-                  if (l_unif)
-                    d_visco = visco;
-                  else
-                    d_visco = tab_visco[elem];
-
-                  // PQ : 01/10/03 : corrections par rapport a la version premiere
-                  norm_tau = d_visco * norm_v / dist;
-
-                  u_etoile = sqrt(norm_tau);
-                  y_plus(elem) = dist * u_etoile / d_visco;
-
-                } // else yplus already computed
-            } // loop on faces
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+          {
+            const int elem = face_voisins(num_face, 0) != -1 ? face_voisins(num_face, 0) : face_voisins(num_face, 1);
+            if (yplus_already_computed)
+              {
+                // y+ is only defined on faces so we take the face value to put in the element
+                Kokkos::atomic_add(&y_plus(elem), yplus_faces(num_face));
+              }
+            else
+              {
+                const int ori = orientation(num_face);
+                double norm_v = 0;
+                if (dim == 2)
+                  {
+                    double val0;
+                    norm_v = norm_2D_vit(vitesse, elem, ori, elem_faces, val0);
+                  }
+                else
+                  {
+                    double val1, val2;
+                    norm_v = norm_3D_vit(vitesse, elem, ori, elem_faces, val1, val2);
+                  }
+                const double dist = is_axi ? dom_vdf.dist_norm_bord_axi(num_face) : dom_vdf.dist_norm_bord(num_face);
+                const double d_visco = l_unif ? visco : tab_visco(elem, 0);
+                const double norm_tau = d_visco * norm_v / dist;
+                Kokkos::atomic_add(&y_plus(elem), dist * Kokkos::sqrt(norm_tau) / d_visco);
+              }
+            Kokkos::atomic_add(&counter(elem), 1.);
+          }); // loop on faces
+          end_gpu_timer(__KERNEL_NAME__);
         } // Fin paroi fixe
     } // Fin boucle sur les bords
+
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, tab_y_plus.size_array()), KOKKOS_LAMBDA(const int elem)
+  {
+    if (counter(elem) > 0.)
+      y_plus(elem) /= counter(elem);
+    else
+      y_plus(elem) = -1.;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
 
 /*! @brief Methode qui renvoie gij aux elements a partir de la vitesse aux faces (gij represente la derivee partielle dui/dxj)
@@ -429,298 +429,314 @@ void Champ_Face_VDF::calcul_y_plus(DoubleTab& y_plus, const Domaine_Cl_VDF& doma
  *  A partir de gij, on peut calculer Sij = 0.5(gij(i,j)+gij(j,i))
  *
  */
-DoubleTab& Champ_Face_VDF::calcul_duidxj(const DoubleTab& vitesse, DoubleTab& gij, const Domaine_Cl_VDF& domaine_Cl_VDF) const
+DoubleTab& Champ_Face_VDF::calcul_duidxj(const DoubleTab& tab_vitesse, DoubleTab& tab_gij, const Domaine_Cl_VDF& domaine_Cl_VDF) const
 {
   const Champ_Face_VDF& vit = ref_cast(Champ_Face_VDF, mon_equation->inconnue());
   const Domaine_Cl_VDF& dclvdf = ref_cast(Domaine_Cl_VDF, vit.domaine_Cl_dis());
   const Domaine_VDF& domaine_VDF = domaine_vdf();
-  const int nb_elem = domaine_VDF.domaine().nb_elem_tot(), N = vitesse.line_size();
-  const IntTab& face_voisins = domaine_VDF.face_voisins(), &elem_faces = domaine_VDF.elem_faces(), &Qdm = domaine_VDF.Qdm();
-  const IntVect& orientation = domaine_VDF.orientation();
+  const int nb_elem = domaine_VDF.domaine().nb_elem_tot(), N = tab_vitesse.line_size();
 
   const int prem_am = domaine_VDF.premiere_arete_mixte(), dern_am = prem_am + domaine_VDF.nb_aretes_mixtes();
   const int prem_ai = domaine_VDF.premiere_arete_interne(), dern_ai = prem_ai + domaine_VDF.nb_aretes_internes();
-  IntVect element(4);
-  gij = 0.;
+  tab_gij = 0.;
 
   // On parcourt toutes les aretes qui permettent de calculer les termes croises du_i/dx_j
   // (les termes non-croises sont calcules en bouclant sur les elements)
 
+  const IntTab& tab_Qdm = domaine_VDF.Qdm();
+
+  // Calcul de val_imp_face_bord_
+  Champ_Face_get_val_imp_face_bord(vit.temps(), val_imp_face_bord_, dclvdf, &tab_vitesse);
+  const bool traitement_gradients = Option_VDF::traitement_gradients;
+  const bool traitement_coins_opt = Option_VDF::traitement_coins;
+  const int dim = Objet_U::dimension;
+  Domaine_VDF_View dom_vdf(domaine_VDF);
+  CIntTabView Qdm = domaine_VDF.Qdm().view_ro();
+  CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+  CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro();
+  CIntArrView orientation = domaine_VDF.orientation().view_ro();
+  CDoubleTabView vitesse = tab_vitesse.view_ro();
+  CDoubleTabView val_imp = val_imp_face_bord_.view_ro();
+  DoubleTabView4 gij = tab_gij.view_rw<4>();
 
   // On commence par les bords
   int ndeb = domaine_VDF.premiere_arete_bord(), nfin = ndeb + domaine_VDF.nb_aretes_bord();
-  for (int num_arete = ndeb; num_arete < nfin; num_arete++)
-    for (int n=0; n<N; n++)
+  DoubleTrav tab_coeff_frot_grad(nfin, N);
+  tab_coeff_frot_grad = 0.;
+  if (Option_VDF::traitement_gradients)
+    {
+      ToDo_Kokkos("critical");
+      for (int num_arete = ndeb; num_arete < nfin; num_arete++)
+        if (domaine_Cl_VDF.type_arete_bord(num_arete - ndeb) == 3)
+          for (int n = 0; n < N; n++)
+            {
+              const int num0 = tab_Qdm(num_arete, 0), num1 = tab_Qdm(num_arete, 1);
+              tab_coeff_frot_grad(num_arete, n) = (Champ_Face_coeff_frottement_grad_face_bord(num0, n, dclvdf) +
+                                                   Champ_Face_coeff_frottement_grad_face_bord(num1, n, dclvdf)) /
+                                                  2.;
+            }
+    }
+  CIntArrView type_arete_bord = domaine_Cl_VDF.type_arete_bord().view_ro();
+  CDoubleTabView coeff_frot_grad = tab_coeff_frot_grad.view_ro();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({ndeb, 0}, {nfin, N}),
+                       KOKKOS_LAMBDA(const int num_arete, const int n)
+  {
+    const int n_type = type_arete_bord(num_arete - ndeb);
+
+    int element[4];
+    if (n_type == 4) // arete de type periodicite
       {
-        const int n_type = domaine_Cl_VDF.type_arete_bord(num_arete - ndeb);
-
-        if (n_type == 4) // arete de type periodicite
-          {
-            const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3);
-            const int i = orientation(num0), j = orientation(num2);
-
-            const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j
-            const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / domaine_VDF.dist_face_period(num2, num3, i); // du_j / dx_i
-
-            element(0) = face_voisins(num0, 0);
-            element(1) = face_voisins(num0, 1);
-            element(2) = face_voisins(num1, 0);
-            element(3) = face_voisins(num1, 1);
-
-            for (int k = 0; k < 4; k++)
-              {
-                // 1) 0.5 : pour la periodicite, car on distribuera deux fois sur les elements qui "touchent" cette arete puisqu'elle existe en double.
-                // 2) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
-                gij(element(k), i, j, n) += temp1 * 0.5 * 0.25;
-                gij(element(k), j, i, n) += temp2 * 0.5 * 0.25;
-              }
-          }
-        else if (n_type == 3 && Option_VDF::traitement_gradients) /* NAVIER - NAVIER */
-          {
-            const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3);
-            const int i = orientation(num0), j = orientation(num2);
-
-            const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j
-            const double coeff_frot = (Champ_Face_coeff_frottement_grad_face_bord(num0, n, dclvdf)+Champ_Face_coeff_frottement_grad_face_bord(num1, n, dclvdf))/2.;
-            const double temp2 = -signe * coeff_frot * vitesse(num2, n);
-
-            element(0) = face_voisins(num2, 0);
-            element(1) = face_voisins(num2, 1);
-
-            for (int k = 0; k < 2; k++)
-              {
-                // 1) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
-                gij(element(k), i, j, n) += temp1 * 0.25;
-                gij(element(k), j, i, n) += temp2 * 0.25;
-              }
-          }
-        else if (Option_VDF::traitement_gradients && (n_type == 5 || n_type == 6))
-          Process::exit("Issue in Champ_Face_VDF::calcul_duidxj ... This case is not yet considered. Contact the TRUST team.");
-        else /* les autres aretes bords ... */
+        const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3);
+        const int i = orientation(num0), j = orientation(num2);
+        const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i/dx_j
+        const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / dom_vdf.dist_face_period(num2, num3, i); // du_j/dx_i
+        element[0] = face_voisins(num0, 0);
+        element[1] = face_voisins(num0, 1);
+        element[2] = face_voisins(num1, 0);
+        element[3] = face_voisins(num1, 1);
+        for (int k = 0; k < 4; k++)
           {
-            const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3);
-            const int i = orientation(num0), j = orientation(num2);
-
-            const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j
-            const double vit_imp = 0.5 * (vit.val_imp_face_bord_private(num0, N*j+n) + vit.val_imp_face_bord_private(num1, N*j+n)); // vitesse tangentielle
-
-            //Dans cette partie, on conserve le codage de Hyd_SGE_Wale_VDF (num1 et non num2) pour calculer la distance entre le centre de la maille et le bord.
-            const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / domaine_VDF.dist_norm_bord(num1);
-
-            element(0) = face_voisins(num2, 0);
-            element(1) = face_voisins(num2, 1);
-
-            for (int k = 0; k < 2; k++)
-              {
-                // 1) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
-                gij(element(k), i, j, n) += temp1 * 0.25;
-                gij(element(k), j, i, n) += temp2 * 0.25;
-              }
+            // 1) 0.5 : pour la periodicite, car on distribuera deux fois sur les elements qui "touchent" cette arete puisqu'elle existe en double.
+            // 2) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
+            Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.5 * 0.25);
+            Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.5 * 0.25);
           }
       }
+    else if (n_type == 3 && traitement_gradients) /* NAVIER - NAVIER */
+      {
+        const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3);
+        const int i = orientation(num0), j = orientation(num2);
+        const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i/dx_j
+        const double temp2 = -signe * coeff_frot_grad(num_arete, n) * vitesse(num2, n);
+        element[0] = face_voisins(num2, 0);
+        element[1] = face_voisins(num2, 1);
+        for (int k = 0; k < 2; k++)
+          if (element[k] >= 0)
+            {
+              // 1) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
+              Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25);
+              Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25);
+            }
+      }
+    else if (traitement_gradients && (n_type == 5 || n_type == 6))
+      Kokkos::abort("Issue in Champ_Face_VDF::calcul_duidxj: n_type 5/6 not handled");
+    else /* les autres aretes bords ... */
+      {
+        const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3);
+        const int i = orientation(num0), j = orientation(num2);
+        const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i/dx_j
+        const double vit_imp = 0.5 * (val_imp(num0, N*j+n) + val_imp(num1, N*j+n)); // vitesse tangentielle
+        //Dans cette partie, on conserve le codage de Hyd_SGE_Wale_VDF (num1 et non num2) pour calculer la distance entre le centre de la maille et le bord.
+        const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / dom_vdf.dist_norm_bord(num1);
+        element[0] = face_voisins(num2, 0);
+        element[1] = face_voisins(num2, 1);
+        for (int k = 0; k < 2; k++)
+          if (element[k] >= 0)
+            {
+              Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25);
+              Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25);
+            }
+      }
+  }); // fin aretes bords
+  end_gpu_timer(__KERNEL_NAME__);
 
   // On continue avec les coins
   ndeb = domaine_VDF.premiere_arete_coin(), nfin = ndeb + domaine_VDF.nb_aretes_coin();
-
-  for (int num_arete = ndeb; num_arete < nfin; num_arete++)
-    for (int n=0; n<N; n++)
+  DoubleTrav tab_coeff_frot_f1_coin(nfin - ndeb, N), tab_coeff_frot_f2_coin(nfin - ndeb, N);
+  tab_coeff_frot_f1_coin = 0.;
+  tab_coeff_frot_f2_coin = 0.;
+  if (traitement_gradients && traitement_coins_opt)
+    for (int num_arete = ndeb; num_arete < nfin; num_arete++)
       {
         const int n_type = domaine_Cl_VDF.type_arete_coin(num_arete - ndeb);
-
-        if (n_type == 0) // arete de type perio-perio
+        if (n_type == 3 || n_type == 4 || n_type == 8)
           {
-            const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3);
-            const int i = orientation(num0), j = orientation(num2);
-
-            const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j
-            const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / domaine_VDF.dist_face_period(num2, num3, i); // du_j / dx_i
-
-            element(0) = face_voisins(num0, 0);
-            element(1) = face_voisins(num0, 1);
-            element(2) = face_voisins(num1, 0);
-            element(3) = face_voisins(num1, 1);
-
-            for (int k = 0; k < 4; k++)
+            const int num0 = tab_Qdm(num_arete, 0), num1 = tab_Qdm(num_arete, 1);
+            const int num2 = tab_Qdm(num_arete, 2), num3 = tab_Qdm(num_arete, 3);
+            const int f1 = num0 > -1 ? num0 : num1, f2 = num2 > -1 ? num2 : num3;
+            for (int n = 0; n < N; n++)
               {
-                // 1) 0.5 : pour la periodicite, car on distribuera deux fois sur les elements qui "touchent" cette arete puisqu'elle existe en double.
-                // 2) 0.5 : idem ci-dessus, car cette fois-ci on a un coin perio-perio.
-                // 3) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
-                gij(element(k), i, j, n) += temp1 * 0.5 * 0.5 * 0.25;
-                gij(element(k), j, i, n) += temp2 * 0.5 * 0.5 * 0.25;
-              }
-          }
-
-        if (n_type == 1) // arete de type perio-paroi
-          {
-            const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3);
-            const int i = orientation(num1), j = orientation(num2);
-
-            const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j
-            const double vit_imp = 0.5 * (vit.val_imp_face_bord_private(num0, N*j+n) + vit.val_imp_face_bord_private(num1, N*j+n)); // vitesse tangentielle
-
-            const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / domaine_VDF.dist_norm_bord(num1);
-
-            element(0) = face_voisins(num2, 0);
-            element(1) = face_voisins(num2, 1);
-
-            for (int k = 0; k < 2; k++)
-              {
-                // 1) 0.5 : pour la periodicite, car on distribuera deux fois sur les elements qui "touchent" cette arete puisqu'elle existe en double.
-                // 2) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
-                gij(element(k), i, j, n) += temp1 * 0.5 * 0.25;
-                gij(element(k), j, i, n) += temp2 * 0.5 * 0.25;
-              }
-          }
-
-        // XXX : Elie Saikali : j'ajoute ca pour les coins juste si option_vdf active pour le moment ...
-
-        if (Option_VDF::traitement_gradients && Option_VDF::traitement_coins)
-          {
-            if (n_type == 14 || n_type == 15) // arete de type fluide-paroi ou paroi-fluide
-              {
-                const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3);
-                const int i = orientation(num1), j = orientation(num2);
-
-                const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j
-                const double vit_imp = 0.5 * (vit.val_imp_face_bord_private(num0, N*j+n) + vit.val_imp_face_bord_private(num1, N*j+n)); // vitesse tangentielle
-
-                const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / domaine_VDF.dist_norm_bord(num1);
-
-                element(0) = face_voisins(num2, 0);
-                element(1) = face_voisins(num2, 1);
-
-                for (int k = 0; k < 2; k++)
-                  if (element(k) != -1)
-                    {
-                      gij(element(k), i, j, n) += temp1 * 0.25;
-                      gij(element(k), j, i, n) += temp2 * 0.25;
-                    }
-              }
-            else if (n_type == 3 || n_type == 4 || n_type == 8) // arete de type fluide-navier
-              {
-                const int num0 = Qdm(num_arete, 0),  num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3);
-                const int f1 = num0 > -1 ? num0 : num1, f2 = num2 > -1 ? num2 : num3;
-                const int i = orientation(f1), j = orientation(f2);
-
-                const double coeff_frot1 = Champ_Face_coeff_frottement_grad_face_bord(f1, n, dclvdf), coeff_frot2 = Champ_Face_coeff_frottement_grad_face_bord(f2, n, dclvdf);
-
-//                int signe = f2 == num3 ? -1 : 1;
-//                const double temp1 = coeff_frot2 * signe * vitesse(f1, n);
-//                const double temp2 = coeff_frot1 * signe * vitesse(f2, n);
-                const double temp1 = coeff_frot2 * (face_voisins(f2, 0)==-1 ? 1:-1)* vitesse(f1, n);
-                const double temp2 = coeff_frot1 * (face_voisins(f1, 0)==-1 ? 1:-1)* vitesse(f2, n);
-
-
-                element(0) = face_voisins(f1, 0);
-                element(1) = face_voisins(f1, 1);
-
-                for (int k = 0; k < 2; k++)
-                  if (element(k) != -1)
-                    {
-                      gij(element(k), i, j, n) += temp1 * 0.25;
-                      gij(element(k), j, i, n) += temp2 * 0.25;
-                    }
+                tab_coeff_frot_f1_coin(num_arete - ndeb, n) = Champ_Face_coeff_frottement_grad_face_bord(f1, n, dclvdf);
+                tab_coeff_frot_f2_coin(num_arete - ndeb, n) = Champ_Face_coeff_frottement_grad_face_bord(f2, n, dclvdf);
               }
           }
       }
 
-  // On continue avec les aretes mixtes
+  CIntArrView type_arete_coin = domaine_Cl_VDF.type_arete_coin().view_ro();
+  CDoubleTabView coeff_frot_f1_coin = tab_coeff_frot_f1_coin.view_ro();
+  CDoubleTabView coeff_frot_f2_coin = tab_coeff_frot_f2_coin.view_ro();
+  int ndeb_coin = ndeb;
 
-  for (int num_arete = prem_am; num_arete < dern_am; num_arete++)
-    for (int n=0; n<N; n++)
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({ndeb, 0}, {nfin, N}),
+                       KOKKOS_LAMBDA(const int num_arete, const int n)
+  {
+    const int n_type = type_arete_coin(num_arete - ndeb_coin);
+    int element[4];
+
+    if (n_type == 0) // arete de type perio-perio
       {
         const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3);
         const int i = orientation(num0), j = orientation(num2);
-
-        const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j
-        const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / domaine_VDF.dist_face_period(num2, num3, i); // du_j / dx_i
-
-        element(0) = face_voisins(num0, 0);
-        element(1) = face_voisins(num0, 1);
-        element(2) = face_voisins(num1, 0);
-        element(3) = face_voisins(num1, 1);
-
+        const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i / dx_j
+        const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / dom_vdf.dist_face_period(num2, num3, i); // du_j / dx_i
+        element[0] = face_voisins(num0, 0);
+        element[1] = face_voisins(num0, 1);
+        element[2] = face_voisins(num1, 0);
+        element[3] = face_voisins(num1, 1);
         for (int k = 0; k < 4; k++)
-          if (element(k) != -1)
+          if (element[k] >= 0)
             {
-              // 1) 0.25 : on distribue le gradient de vitesse sur les 3 elements qui l'entourent.
-              // C'est pour cela que l'on regarde si element(k)!=-1, car dans ce cas la, c'est qu'il s'agit de "la case qui manque" !
-              gij(element(k), i, j, n) += temp1 * 0.25;
-              gij(element(k), j, i, n) += temp2 * 0.25;
+              // 1) 0.5 : pour la periodicite, car on distribuera deux fois puisqu'elle existe en double.
+              // 2) 0.5 : idem, car cette fois-ci on a un coin perio-perio.
+              // 3) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
+              Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.5 * 0.5 * 0.25);
+              Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.5 * 0.5 * 0.25);
             }
       }
 
-  // On continue avec les aretes internes
-
-  for (int num_arete = prem_ai; num_arete < dern_ai; num_arete++)
-    for (int n=0; n<N; n++)
+    if (n_type == 1) // arete de type perio-paroi
       {
-        const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3);
-        const int i = orientation(num0), j = orientation(num2);
-
-        const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / domaine_VDF.dist_face_period(num0, num1, j); // du_i / dx_j
-        assert(est_egal(domaine_VDF.dist_face_period(num0, num1, j), domaine_VDF.dist_face(num0, num1, j)));
-
-        const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / domaine_VDF.dist_face_period(num2, num3, i); // du_j / dx_i
-        assert(est_egal(domaine_VDF.dist_face_period(num2, num3, j), domaine_VDF.dist_face(num2, num3, j)));
-
-        element(0) = face_voisins(num0, 0);
-        element(1) = face_voisins(num0, 1);
-        element(2) = face_voisins(num1, 0);
-        element(3) = face_voisins(num1, 1);
+        const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3);
+        const int i = orientation(num1), j = orientation(num2);
+        const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i / dx_j
+        const double vit_imp = 0.5 * (val_imp(num0, N*j+n) + val_imp(num1, N*j+n)); // vitesse tangentielle
+        const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / dom_vdf.dist_norm_bord(num1);
+        element[0] = face_voisins(num2, 0);
+        element[1] = face_voisins(num2, 1);
+        for (int k = 0; k < 2; k++)
+          if (element[k] >= 0)
+            {
+              // 1) 0.5 : pour la periodicite.
+              // 2) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
+              Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.5 * 0.25);
+              Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.5 * 0.25);
+            }
+      }
 
-        for (int k = 0; k < 4; k++)
+    // XXX : Elie Saikali : j'ajoute ca pour les coins juste si option_vdf active pour le moment ...
+    if (traitement_gradients && traitement_coins_opt)
+      {
+        if (n_type == 14 || n_type == 15) // arete de type fluide-paroi ou paroi-fluide
+          {
+            const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), signe = Qdm(num_arete, 3);
+            const int i = orientation(num1), j = orientation(num2);
+            const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i / dx_j
+            const double vit_imp = 0.5 * (val_imp(num0, N*j+n) + val_imp(num1, N*j+n)); // vitesse tangentielle
+            const double temp2 = -signe * (vitesse(num2, n) - vit_imp) / dom_vdf.dist_norm_bord(num1);
+            element[0] = face_voisins(num2, 0);
+            element[1] = face_voisins(num2, 1);
+            for (int k = 0; k < 2; k++)
+              if (element[k] != -1)
+                {
+                  Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25);
+                  Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25);
+                }
+          }
+        else if (n_type == 3 || n_type == 4 || n_type == 8) // arete de type fluide-navier
           {
-            // 1) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
-            gij(element(k), i, j, n) += temp1 * 0.25;
-            gij(element(k), j, i, n) += temp2 * 0.25;
+            const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1);
+            const int num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3);
+            const int f1 = num0 > -1 ? num0 : num1, f2 = num2 > -1 ? num2 : num3;
+            const int i = orientation(f1), j = orientation(f2);
+            const double coeff_frot1 = coeff_frot_f1_coin(num_arete - ndeb_coin, n);
+            const double coeff_frot2 = coeff_frot_f2_coin(num_arete - ndeb_coin, n);
+            const double temp1 = coeff_frot2 * (face_voisins(f2, 0) == -1 ? 1 : -1) * vitesse(f1, n);
+            const double temp2 = coeff_frot1 * (face_voisins(f1, 0) == -1 ? 1 : -1) * vitesse(f2, n);
+            element[0] = face_voisins(f1, 0);
+            element[1] = face_voisins(f1, 1);
+            for (int k = 0; k < 2; k++)
+              if (element[k] != -1)
+                {
+                  Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25);
+                  Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25);
+                }
           }
       }
+  }); // fin aretes coins
+  end_gpu_timer(__KERNEL_NAME__);
+
+  // On continue avec les aretes mixtes
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({prem_am, 0}, {dern_am, N}),
+                       KOKKOS_LAMBDA(const int num_arete, const int n)
+  {
+    const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3);
+    const int i = orientation(num0), j = orientation(num2);
+    const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i / dx_j
+    const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / dom_vdf.dist_face_period(num2, num3, i); // du_j / dx_i
+    const int element[4] = { face_voisins(num0, 0), face_voisins(num0, 1), face_voisins(num1, 0), face_voisins(num1, 1) };
+    for (int k = 0; k < 4; k++)
+      if (element[k] != -1)
+        {
+          // 1) 0.25 : on distribue le gradient de vitesse sur les 3 elements qui l'entourent.
+          Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25);
+          Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25);
+        }
+  }); // fin aretes mixtes
+  end_gpu_timer(__KERNEL_NAME__);
 
+  // On continue avec les aretes internes
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({prem_ai, 0}, {dern_ai, N}),
+                       KOKKOS_LAMBDA(const int num_arete, const int n)
+  {
+    const int num0 = Qdm(num_arete, 0), num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2), num3 = Qdm(num_arete, 3);
+    const int i = orientation(num0), j = orientation(num2);
+    const double temp1 = (vitesse(num1, n) - vitesse(num0, n)) / dom_vdf.dist_face_period(num0, num1, j); // du_i / dx_j
+    const double temp2 = (vitesse(num3, n) - vitesse(num2, n)) / dom_vdf.dist_face_period(num2, num3, i); // du_j / dx_i
+    const int element[4] = { face_voisins(num0, 0), face_voisins(num0, 1), face_voisins(num1, 0), face_voisins(num1, 1) };
+    for (int k = 0; k < 4; k++)
+      {
+        // 1) 0.25 : on distribue le gradient de vitesse sur les 4 elements qui l'entourent.
+        Kokkos::atomic_add(&gij(element[k], i, j, n), temp1 * 0.25);
+        Kokkos::atomic_add(&gij(element[k], j, i, n), temp2 * 0.25);
+      }
+  }); // fin aretes internes
+  end_gpu_timer(__KERNEL_NAME__);
 
   // XXX : Elie Saikali : HACK pour coins fluides-fluides
   // pour ce cas (j'avoue cas rare), attention soucis avec les valeurs de la vitesse sur les coins ... par exemple un champ_fonc_xyz x+y+z donne pas le bon truc sur les coins
 
   // On continue avec les coins
-
   ndeb = domaine_VDF.premiere_arete_coin(), nfin = ndeb + domaine_VDF.nb_aretes_coin();
-
-  for (int num_arete = ndeb; num_arete < nfin; num_arete++)
-    for (int n=0; n<N; n++)
-      {
-        const int n_type = domaine_Cl_VDF.type_arete_coin(num_arete - ndeb);
-
-        if (Option_VDF::traitement_gradients && Option_VDF::traitement_coins)
-          if (n_type == 16 ) // arete de type fluide-fluide
+  ndeb_coin = ndeb;
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({ndeb, 0}, {nfin, N}),
+                       KOKKOS_LAMBDA(const int num_arete, const int n)
+  {
+    const int n_type = type_arete_coin(num_arete - ndeb_coin);
+    if (traitement_gradients && traitement_coins_opt)
+      if (n_type == 16) // arete de type fluide-fluide
+        {
+          const int num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2);
+          const int i = orientation(num1), j = orientation(num2);
+          const int el0 = face_voisins(num2, 0), el1 = face_voisins(num2, 1);
+          // XXX : 1/3 car on veut un truc comme ca : (a+b+c+d)/4 = (a+b+c)/3 => d = (a+b+c)/3
+          if (el0 != -1)
             {
-              const int num1 = Qdm(num_arete, 1), num2 = Qdm(num_arete, 2);
-              const int i = orientation(num1), j = orientation(num2);
-
-              element(0) = face_voisins(num2, 0);
-              element(1) = face_voisins(num2, 1);
-
-              for (int k = 0; k < 2; k++)
-                if (element(k) != -1)
-                  {
-                    // XXX : 1/3 car on veut un truc comme ca : (a+b+c+d)/4 = (a+b+c)/3 => d = (a+b+c)/3
-                    gij(element(k), i, j, n) += gij(element(k), i, j, n) / 3.;
-                    gij(element(k), j, i, n) += gij(element(k), j, i, n) / 3.;
-                  }
+              Kokkos::atomic_add(&gij(el0, i, j, n), gij(el0, i, j, n) / 3.);
+              Kokkos::atomic_add(&gij(el0, j, i, n), gij(el0, j, i, n) / 3.);
             }
-      }
+          if (el1 != -1)
+            {
+              Kokkos::atomic_add(&gij(el1, i, j, n), gij(el1, i, j, n) / 3.);
+              Kokkos::atomic_add(&gij(el1, j, i, n), gij(el1, j, i, n) / 3.);
+            }
+        }
+  }); // fin coins fluides-fluides
+  end_gpu_timer(__KERNEL_NAME__);
 
   // 2eme partie : boucle sur les elements et remplissage de Sij pour les derivees non croisees (du_i / dx_i).
   // En fait dans ces cas la, on calcul directement le gradient dans l'element et on ne redistribue pas.
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0, 0}, {nb_elem, N}),
+                       KOKKOS_LAMBDA(const int elem, const int n)
+  {
+    for (int i = 0; i < dim; i++)
+      {
+        const double temp1 = (vitesse(elem_faces(elem, i), n) - vitesse(elem_faces(elem, i + dim), n)) / dom_vdf.dim_elem(elem, orientation(elem_faces(elem, i)));
+        gij(elem, i, i, n) = -temp1;
+      }
+  }); // fin elements
+  end_gpu_timer(__KERNEL_NAME__);
 
-  for (int elem = 0; elem < nb_elem; elem++)
-    for (int n=0; n<N; n++)
-      for (int i = 0; i < dimension; i++)
-        {
-          double temp1 = (vitesse(elem_faces(elem, i), n) - vitesse(elem_faces(elem, i + dimension), n)) / domaine_VDF.dim_elem(elem, orientation(elem_faces(elem, i)));
-          gij(elem, i, i, n) = -temp1;
-        }
-
-  return gij;
+  return tab_gij;
 }
 
 /*! @brief Methode qui renvoie gij aux elements a partir de la vitesse aux elements (gij represente la derivee partielle dui/dxj)
@@ -744,6 +760,7 @@ DoubleTab& Champ_Face_VDF::calcul_duidxj(const DoubleTab& in_vel, DoubleTab& gij
   //
   if (dimension == 2)
     {
+      ToDo_Kokkos("critical");
       for (int element_number = 0; element_number < nb_elem_tot; element_number++)
         for (int n=0; n<N; n++)
           {
@@ -772,6 +789,7 @@ DoubleTab& Champ_Face_VDF::calcul_duidxj(const DoubleTab& in_vel, DoubleTab& gij
     }
   else
     {
+      ToDo_Kokkos("critical");
       for (int element_number = 0; element_number < nb_elem_tot; element_number++)
         for (int n=0; n<N; n++)
           {
@@ -847,7 +865,7 @@ DoubleVect& Champ_Face_VDF::calcul_S_barre_sans_contrib_paroi(const DoubleTab& v
   ArrOfInt element(4);
 
   int ndeb = domaine_VDF.premiere_arete_bord(), nfin = ndeb + domaine_VDF.nb_aretes_bord();
-
+  ToDo_Kokkos("critical");
   for (int num_arete = ndeb; num_arete < nfin; num_arete++)
     {
       int n_type = domaine_Cl_VDF.type_arete_bord(num_arete - ndeb);
@@ -1022,85 +1040,81 @@ DoubleVect& Champ_Face_VDF::calcul_S_barre_sans_contrib_paroi(const DoubleTab& v
   return SMA_barre;
 }
 
-DoubleVect& Champ_Face_VDF::calcul_S_barre(const DoubleTab& vitesse, DoubleVect& SMA_barre, const Domaine_Cl_VDF& domaine_Cl_VDF) const
+DoubleVect& Champ_Face_VDF::calcul_S_barre(const DoubleTab& tab_vitesse, DoubleVect& tab_SMA_barre, const Domaine_Cl_VDF& domaine_Cl_VDF) const // stores 2 * sum_ij Sij^2 per element in tab_SMA_barre and returns it
 {
   const Domaine_VDF& domaine_VDF = domaine_vdf();
   const int nb_elem_tot = domaine_VDF.nb_elem_tot();
-  const int nb_elem = domaine_VDF.nb_elem(), N = vitesse.line_size();
-
-  int i, j;
-  int elem;
-  double Sij, temp;
-
-  DoubleTab duidxj(nb_elem_tot, dimension, dimension, N);
-
-  calcul_duidxj(vitesse, duidxj, domaine_Cl_VDF);
-
-  for (elem = 0; elem < nb_elem; elem++)
-    for (int n=0; n<N; n++)
+  const int nb_elem = domaine_VDF.nb_elem(), N = tab_vitesse.line_size();
+  const int dim = Objet_U::dimension; // copied to a local so that the kernel below can capture it
+  DoubleTrav duidxj(nb_elem_tot, dim, dim, N); // work array indexed (elem, i, j, n)
+  calcul_duidxj(tab_vitesse, duidxj, domaine_Cl_VDF); // fills duidxj with the velocity gradient dui/dxj at the elements
+  CDoubleTabView4 tab_duidxj = duidxj.view_ro<4>(); // view read with four indices inside the kernel
+  DoubleArrView SMA_barre = static_cast<ArrOfDouble&>(tab_SMA_barre).view_rw(); // output accessed as a flat array, one entry per element
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem) // elements [0, nb_elem) only
+  {
+    for (int n = 0; n < N; n++) // NOTE(review): the same SMA_barre(elem) is written for every n, so the last n is the value that remains
       {
-        temp = 0.;
-        for (i = 0; i < dimension; i++)
-          for (j = 0; j < dimension; j++)
+        double temp = 0.; // accumulates sum_ij Sij^2
+        for (int i = 0; i < dim; i++)
+          for (int j = 0; j < dim; j++)
             {
-              Sij = 0.5 * (duidxj(elem, i, j, n) + duidxj(elem, j, i, n));
+              double Sij = 0.5 * (tab_duidxj(elem, i, j, n) + tab_duidxj(elem, j, i, n)); // symmetric part of the gradient
               temp += Sij * Sij;
             }
         SMA_barre(elem) = 2. * temp;
       }
-
-  return SMA_barre;
-
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  return tab_SMA_barre;
 }
 
-DoubleTab& Champ_Face_VDF::calcul_S_barre_Multiphase(const DoubleTab& vitesse, DoubleTab& SMA_barre, const Domaine_Cl_VDF& domaine_Cl_VDF) const
+DoubleTab& Champ_Face_VDF::calcul_S_barre_Multiphase(const DoubleTab& tab_vitesse, DoubleTab& tab_SMA_barre, const Domaine_Cl_VDF& domaine_Cl_VDF) const // stores 2 * sum_ij Sij^2 in tab_SMA_barre(elem, n) and returns it
 {
   const Domaine_VDF& domaine_VDF = domaine_vdf();
   const int nb_elem_tot = domaine_VDF.nb_elem_tot();
   const int nb_elem = domaine_VDF.nb_elem();
-  const int N = vitesse.line_size();
-
-  int i, j;
-  int elem;
-  double Sij, temp;
-
-  DoubleTab duidxj(nb_elem_tot, dimension, dimension, N);
-
-  calcul_duidxj(vitesse, duidxj, domaine_Cl_VDF);
-
-  for (elem = 0; elem < nb_elem; elem++)
-    for (int n=0; n<N; n++)
+  const int N = tab_vitesse.line_size();
+  const int dim = Objet_U::dimension; // copied to a local so that the kernel below can capture it
+  DoubleTrav duidxj(nb_elem_tot, dim, dim, N); // work array indexed (elem, i, j, n)
+  calcul_duidxj(tab_vitesse, duidxj, domaine_Cl_VDF); // fills duidxj with the velocity gradient dui/dxj at the elements
+  CDoubleTabView4 tab_duidxj = duidxj.view_ro<4>(); // view read with four indices inside the kernel
+  DoubleTabView SMA_barre = tab_SMA_barre.view_rw(); // output indexed (elem, n)
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem) // elements [0, nb_elem) only
+  {
+    for (int n = 0; n < N; n++) // one result per column n of the velocity table
       {
-        temp = 0.;
-        for (i = 0; i < dimension; i++)
-          for (j = 0; j < dimension; j++)
+        double temp = 0.; // accumulates sum_ij Sij^2
+        for (int i = 0; i < dim; i++)
+          for (int j = 0; j < dim; j++)
             {
-              Sij = 0.5 * (duidxj(elem, i, j, n) + duidxj(elem, j, i, n));
+              double Sij = 0.5 * (tab_duidxj(elem, i, j, n) + tab_duidxj(elem, j, i, n)); // symmetric part of the gradient
               temp += Sij * Sij;
             }
-        SMA_barre(elem,n) = 2. * temp;
+        SMA_barre(elem, n) = 2. * temp;
       }
-
-  return SMA_barre;
-
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  return tab_SMA_barre;
 }
 
-void Champ_Face_VDF::calcul_grad_u(const DoubleTab& vitesse, DoubleTab& grad_u, const Domaine_Cl_VDF& domaine_Cl_VDF)
+void Champ_Face_VDF::calcul_grad_u(const DoubleTab& vitesse, DoubleTab& tab_grad_u, const Domaine_Cl_VDF& domaine_Cl_VDF) // flattens the element gradient dui/dxj into the columns of tab_grad_u
 {
   const Domaine_VDF& domaine_VDF = domaine_vdf();
   const int nb_elem = domaine_VDF.nb_elem();
   const int nb_elem_tot = domaine_VDF.nb_elem_tot(), N = vitesse.line_size();
-
-  DoubleTab gradient_elem(nb_elem_tot, dimension, dimension, N);
-  gradient_elem = 0.;
-
-  calcul_duidxj(vitesse, gradient_elem, domaine_Cl_VDF);
-
-  for (int elem = 0; elem < nb_elem; elem++)
-    for (int n=0; n<N; n++)
-      for (int i = 0; i < dimension; i++)
-        for (int j = 0; j < dimension; j++)
-          grad_u(elem, N * ( dimension*i+j ) + n) = gradient_elem(elem, i, j, n);
+  const int dim = Objet_U::dimension; // copied to a local so that the kernel below can capture it
+  DoubleTrav tab_gradient_elem(nb_elem_tot, dim, dim, N); // NOTE(review): the explicit "= 0." of the former DoubleTab is gone; presumably DoubleTrav starts zeroed - confirm
+  calcul_duidxj(vitesse, tab_gradient_elem, domaine_Cl_VDF); // fills the (elem, i, j, n) gradient
+  CDoubleTabView4 gradient_elem = tab_gradient_elem.view_ro<4>(); // view read with four indices inside the kernel
+  DoubleTabView grad_u = tab_grad_u.view_rw(); // output indexed (elem, column)
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem) // elements [0, nb_elem) only
+  {
+    for (int n = 0; n < N; n++)
+      for (int i = 0; i < dim; i++)
+        for (int j = 0; j < dim; j++)
+          grad_u(elem, N * (dim*i+j) + n) = gradient_elem(elem, i, j, n); // column layout: N * (dim * i + j) + n
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
 
 void Champ_Face_VDF::calculer_dscald_centre_element(DoubleTab& dscald) const
@@ -1137,6 +1151,7 @@ void Champ_Face_VDF::calculer_dercov_axi(const Domaine_Cl_VDF& domaine_Cl_VDF)
 
   int fx0, fx1, fy0, fy1;
   int num_elem;
+  ToDo_Kokkos("critical");
   for (num_elem = 0; num_elem < domaine_VDF.nb_elem(); num_elem++)
     {
       fx0 = elem_faces(num_elem, 0);
@@ -1207,20 +1222,18 @@ void Champ_Face_VDF::calculer_dercov_axi(const Domaine_Cl_VDF& domaine_Cl_VDF)
             signe = Qdm(n_arete, 3);
             ori1 = orientation(fac1);
             ori3 = orientation(fac3);
-            int rang1 = fac1 - domaine_VDF.premiere_face_bord();
-            int rang2 = fac2 - domaine_VDF.premiere_face_bord();
             double vit_imp;
 
             if (n_type == TypeAreteBordVDF::PAROI_FLUIDE)
               // arete paroi_fluide :il faut determiner qui est la face fluide
               {
                 if (est_egal(inco[fac1], 0))
-                  vit_imp = val_imp_face_bord_private(rang2, ori3);
+                  vit_imp = val_imp_face_bord_private(fac2, ori3);
                 else
-                  vit_imp = val_imp_face_bord_private(rang1, ori3);
+                  vit_imp = val_imp_face_bord_private(fac1, ori3);
               }
             else
-              vit_imp = 0.5 * (val_imp_face_bord_private(rang1, ori3) + val_imp_face_bord_private(rang2, ori3));
+              vit_imp = 0.5 * (val_imp_face_bord_private(fac1, ori3) + val_imp_face_bord_private(fac2, ori3));
 
             if (ori1 == 0) // bord d'equation R = cte
               {
@@ -1365,126 +1378,175 @@ void Champ_Face_VDF::calculer_dercov_axi(const Domaine_Cl_VDF& domaine_Cl_VDF)
 /*           METHODES UTILES MAIS HORS CLASSE            */
 /* ***************************************************** */
 
-double Champ_Face_get_val_imp_face_bord_sym(const DoubleTab& tab_valeurs, const double temp, int face, int comp, const Domaine_Cl_VDF& zclo)
+// Fills val_imp_face_bord_(face, comp) with the value imposed on each boundary face, one kernel per boundary condition:
+//  - Navier: 0 for the component along the face orientation, otherwise the porosity-weighted mean of the two faces of
+//    the neighbouring element in that direction, read from tab_inco (which is therefore mandatory for this boundary type);
+//  - Dirichlet_entree_fluide / Dirichlet_paroi_defilante: the table returned by tab_val_imp(temps);
+//  - any other boundary condition: 0. The table is resized to (nb_faces_tot, N * dim) only while it is still empty.
+void Champ_Face_get_val_imp_face_bord(const double temps, DoubleTab& val_imp_face_bord_, const Domaine_Cl_VDF& zcl, const DoubleTab* tab_inco)
 {
-  const Domaine_VDF& domaine_vdf = zclo.domaine_VDF();
-  int face_locale = -123;
-  const int face_globale = face + domaine_vdf.premiere_face_bord(); // Maintenant numero dans le tableau global des faces.
-  const Domaine_Cl_dis_base& zcl = zclo; //equation().domaine_Cl_dis();
-  // On recupere la CL associee a la face et le numero local de la face dans la frontiere.
-  //assert(equation().domaine_Cl_dis()==zclo);
-
-  const Cond_lim_base& cl = (face < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) :
-                            zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale);
-
-  const IntTab& face_voisins = domaine_vdf.face_voisins();
-  const IntTab& elem_faces = domaine_vdf.elem_faces();
-  const DoubleVect& porosite = zclo.equation().milieu().porosite_face();
-  const int ori = domaine_vdf.orientation()(face_globale);
-
-  if (sub_type(Navier, cl))
+  const Domaine_VDF& domaine_vdf = zcl.domaine_VDF();
+  // ToDo_Kokkos("Reduce size of val_imp_face_bord_ !");
+  int dim = Objet_U::dimension;
+  int N = tab_inco ? tab_inco->line_size() : 1; // a single value per direction when the unknown is not provided
+  if (val_imp_face_bord_.size() == 0)
+    val_imp_face_bord_.resize(domaine_vdf.nb_faces_tot(), N * dim);
+  for (int n_bord = 0; n_bord < zcl.nb_cond_lim(); n_bord++)
     {
-      const int N = tab_valeurs.line_size();
-      const int n=comp%N, comploc = (comp-n)/N;
-      if (comploc == ori)
-        return 0;
+      const Cond_lim_base& cl = zcl.les_conditions_limites(n_bord).valeur();
+      const Front_VF& le_bord = ref_cast(Front_VF, cl.frontiere_dis());
+      CIntArrView le_bord_num_face = le_bord.num_face().view_ro(); // index inside the boundary -> global face number
+      DoubleTabView val_imp_face_bord = val_imp_face_bord_.view_wo();
+      if (sub_type(Navier, cl))
+        {
+          if (!tab_inco) // the tangential values are interpolated from the unknown: stop cleanly rather than dereference a null pointer
+            Process::exit("Champ_Face_get_val_imp_face_bord : tab_inco is needed for a Navier boundary condition\n");
+          CIntTabView face_voisins = domaine_vdf.face_voisins().view_ro();
+          CIntTabView elem_faces = domaine_vdf.elem_faces().view_ro();
+          CDoubleArrView porosite = zcl.equation().milieu().porosite_face().view_ro();
+          CIntArrView orientation = domaine_vdf.orientation().view_ro();
+          CDoubleTabView inco = tab_inco->view_ro();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), le_bord.nb_faces_tot(), KOKKOS_LAMBDA(const int ind_face)
+          {
+            int face = le_bord_num_face(ind_face);
+            int ori = orientation(face);
+            int nb_comp = N * dim;
+            for (int comp = 0; comp < nb_comp; comp++)
+              {
+                double val_imp = 0; // value kept for the component along the face orientation
+                int n = comp % N, comp1 = comp / N; // comp = N * direction + n
+                if (comp1 != ori)
+                  {
+                    int elem = face_voisins(face, 0);
+                    if (elem == -1) elem = face_voisins(face, 1); // use the other side when the first neighbour is missing
+                    int fac1 = elem_faces(elem, comp1);
+                    int fac2 = elem_faces(elem, comp1 + dim);
+                    double sum = porosite(fac1) + porosite(fac2);
+                    val_imp = sum==0 ? 0 : (inco(fac1, n) * porosite(fac1) + inco(fac2, n) * porosite(fac2)) / sum;
+                  }
+                val_imp_face_bord(face, comp) = val_imp;
+              }
+          });
+          end_gpu_timer(__KERNEL_NAME__);
+        }
+      else if (sub_type(Dirichlet_entree_fluide, cl) || sub_type(Dirichlet_paroi_defilante, cl))
+        {
+          CDoubleTabView vals = ref_cast(Dirichlet, cl).tab_val_imp(temps).view_ro();
+          const bool ch_unif = vals.extent(0) == 1; // a single row is then shared by every face of the boundary
+          const int nb_comp = (int)vals.extent(1);
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), le_bord.nb_faces_tot(), KOKKOS_LAMBDA(const int ind_face)
+          {
+            int face = le_bord_num_face(ind_face);
+            for (int comp = 0; comp < nb_comp; comp++)
+              val_imp_face_bord(face, comp) = vals(ch_unif ? 0 : ind_face, comp);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
+        }
       else
         {
-          int elem = 0;
-          if (face_voisins(face_globale, 0) != -1)
-            elem = face_voisins(face_globale, 0);
-          else
-            elem = face_voisins(face_globale, 1);
-          const int comp2 = comploc + Objet_U::dimension;
-          return (tab_valeurs(elem_faces(elem, comploc), n) * porosite[elem_faces(elem, comploc)] + tab_valeurs(elem_faces(elem, comp2), n) * porosite[elem_faces(elem, comp2)])
-                 / (porosite[elem_faces(elem, comploc)] + porosite[elem_faces(elem, comp2)]);
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), le_bord.nb_faces_tot(), KOKKOS_LAMBDA(const int ind_face)
+          {
+            int face = le_bord_num_face(ind_face);
+            int nb_comp = N * dim;
+            for (int comp = 0; comp < nb_comp; comp++)
+              val_imp_face_bord(face, comp) = 0;
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
-
-  if (!cl.champ_front().has_valeurs_au_temps(temp)) // si pas encore initialise !!
-    return 0.;
-
-  const DoubleTab& vals = cl.champ_front().valeurs_au_temps(temp);
-  const int face_de_vals = vals.dimension(0) == 1 ? 0 : face_locale;
-
-  if (sub_type(Dirichlet_entree_fluide, cl))
-    return vals(face_de_vals, comp);
-  else if (sub_type(Dirichlet_paroi_fixe, cl))
-    return 0.;
-  else if (sub_type(Dirichlet_paroi_defilante, cl))
-    return vals(face_de_vals, comp);
-
-  return 0.; // All other cases
 }
 
-double Champ_Face_get_val_imp_face_bord(const double temp, int face, int comp, const Domaine_Cl_VDF& zclo)
+// Host, single-value variant: value imposed on boundary face face_globale (global face number) for component comp.
+// On a Navier boundary tab_valeurs (the unknown) provides the tangential value and comp then spans N * dimension entries
+// (comp = N * direction + n); without it only the component along the face orientation (always 0) can be returned.
+double Champ_Face_get_val_imp_face_bord(const double temp, int face_globale, int comp, const Domaine_Cl_VDF& zcl, const DoubleTab* tab_valeurs)
 {
-  const Domaine_VDF& domaine_vdf = zclo.domaine_VDF();
+  const Domaine_VDF& domaine_vdf = zcl.domaine_VDF();
   int face_locale = -123;
-  const int face_globale = face + domaine_vdf.premiere_face_bord(); // Maintenant numero dans le tableau global des faces.
-  const Domaine_Cl_dis_base& zcl = zclo; //equation().domaine_Cl_dis();
-  // On recupere la CL associee a la face et le numero local de la face dans la frontiere.
-  //assert(equation().domaine_Cl_dis()==zclo);
+  const Cond_lim_base& cl = (face_globale < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) : zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale);
 
-  const Cond_lim_base& cl = (face < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) :
-                            zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale);
-  const int ori = domaine_vdf.orientation()(face_globale);
+  if (!cl.champ_front().has_valeurs_au_temps(temp)) // boundary field not initialised yet
+    return 0.;
 
   if (sub_type(Navier, cl))
     {
+      double val_imp;
+      const int ori = domaine_vdf.orientation()(face_globale);
-      if (comp == ori)
-        return 0.;
+      if (!tab_valeurs && comp == ori) // no unknown given: comp is a plain direction index
+        val_imp = 0.;
       else
         {
-          Process::exit("You should call Champ_Face_get_val_imp_face_bord_sym and not Champ_Face_get_val_imp_face_bord\n");
-          return 1.e9;
+          if (!tab_valeurs) // a tangential value cannot be interpolated without the unknown
+            Process::exit("Champ_Face_get_val_imp_face_bord : tab_valeurs is needed for a Navier boundary condition\n");
+          const int N = tab_valeurs->line_size();
+          const int n = comp % N, comploc = (comp - n) / N;
+          if (comploc == ori)
+            val_imp = 0.;
+          else
+            {
+              const IntTab& face_voisins = domaine_vdf.face_voisins();
+              const IntTab& elem_faces = domaine_vdf.elem_faces();
+              const DoubleVect& porosite = zcl.equation().milieu().porosite_face();
+              int elem = face_voisins(face_globale, 0);
+              if (elem == -1) elem = face_voisins(face_globale, 1); // use the other side when the first neighbour is missing
+              const int fac1 = elem_faces(elem, comploc), fac2 = elem_faces(elem, comploc + Objet_U::dimension);
+              const double sum = porosite[fac1] + porosite[fac2]; // a zero sum yields 0 instead of a division by zero
+              val_imp = sum == 0 ? 0. : ((*tab_valeurs)(fac1, n) * porosite[fac1] + (*tab_valeurs)(fac2, n) * porosite[fac2]) / sum;
+            }
         }
+      return val_imp;
     }
-
-  if (!cl.champ_front().has_valeurs_au_temps(temp)) // si pas encore initialise !!
-    return 0.;
-
-  const DoubleTab& vals = cl.champ_front().valeurs_au_temps(temp);
-  int face_de_vals = vals.dimension(0) == 1 ? 0 : face_locale;
-
-  if (sub_type(Dirichlet_entree_fluide, cl))
-    return vals(face_de_vals, comp);
-  else if (sub_type(Dirichlet_paroi_fixe, cl))
-    return 0.;
-  else if (sub_type(Dirichlet_paroi_defilante, cl))
-    return vals(face_de_vals, comp);
-
-  return 0.; // All other cases
-}
-
-double Champ_Face_get_val_imp_face_bord(const double temp, int face, int comp, int comp2, const Domaine_Cl_VDF& zclo)
-{
-  Process::exit("Champ_Face_VDF::val_imp_face_bord(,,) exit\n");
-  return 0.; // For compilers
+  else if (sub_type(Dirichlet_entree_fluide, cl) || sub_type(Dirichlet_paroi_defilante, cl))
+    {
+      const DoubleTab& vals = cl.champ_front().valeurs_au_temps(temp);
+      return vals(vals.dimension(0) == 1 ? 0 : face_locale, comp);
+    }
+  else
+    return 0.; // All other cases
 }
 
-double Champ_Face_coeff_frottement_face_bord(const int f, const int n, const Domaine_Cl_VDF& zclo)
+void Champ_Face_coeff_frottement_face_bord(DoubleTab& coeff_frottement_face_bord_, const Domaine_Cl_VDF& zcl) // fills coeff_frottement_face_bord_(face, comp) on the faces of every Navier boundary
 {
-  const Domaine_VDF& domaine_vdf = zclo.domaine_VDF();
-  const Domaine_Cl_dis_base& zcl = zclo;
-  const int face_globale = f + domaine_vdf.premiere_face_bord(); // Maintenant numero dans le tableau global des faces.
-
-  int face_locale = -123;
-  const Cond_lim_base& cl = (f < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) :
-                            zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale);
-
-  return sub_type(Navier, cl) ? ref_cast(Navier, cl).coefficient_frottement(face_locale,n) : 0.;
+  const Domaine_VDF& domaine_vdf = zcl.domaine_VDF();
+  //ToDo_Kokkos("Reduce size of coeff_frottement_face_bord_ !");
+  if (coeff_frottement_face_bord_.size() == 0) // resized only while still empty
+    coeff_frottement_face_bord_.resize(domaine_vdf.nb_faces_tot(), Objet_U::dimension); // one row per face, one column per direction
+
+  // Compute coeff_frottement_face_bord_
+  int dim = Objet_U::dimension;
+  for (int n_bord = 0; n_bord < domaine_vdf.nb_front_Cl(); n_bord++) // boundaries that are not Navier are left untouched by this loop
+    {
+      const Cond_lim_base& cl = zcl.les_conditions_limites(n_bord).valeur();
+      const Front_VF& le_bord = ref_cast(Front_VF, cl.frontiere_dis());
+      if (sub_type(Navier, cl))
+        {
+          int nb_comp = dim; // default column count when the boundary condition provides no coefficient table
+          const Navier& la_cl = ref_cast(Navier, cl);
+          CIntArrView le_bord_num_face = le_bord.num_face().view_ro(); // index inside the boundary -> global face number
+          CDoubleTabView coefficient_frottement; // stays default-constructed when no coefficient table is available
+          if (la_cl.coefficient_frottement())
+            {
+              coefficient_frottement = la_cl.coefficient_frottement()->view_ro();
+              nb_comp = (int)coefficient_frottement.extent(1); // NOTE(review): destination has Objet_U::dimension columns when resized above - confirm both agree
+            }
+          DoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_wo();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), le_bord.nb_faces_tot(), KOKKOS_LAMBDA(const int ind_face)
+          {
+            int face = le_bord_num_face(ind_face);
+            for (int comp = 0; comp < nb_comp; comp++)
+              coeff_frottement_face_bord(face, comp) = coefficient_frottement.data() ? coefficient_frottement(ind_face, comp) : 0; // 0 when the view holds no data
+          });
+          end_gpu_timer(__KERNEL_NAME__);
+        }
+    }
 }
 
-double Champ_Face_coeff_frottement_grad_face_bord(const int f, const int n, const Domaine_Cl_VDF& zclo)
+double Champ_Face_coeff_frottement_grad_face_bord(const int face_globale, const int n, const Domaine_Cl_VDF& zcl) // face_globale is a global face number: premiere_face_bord() is no longer added here
 {
-  const Domaine_VDF& domaine_vdf = zclo.domaine_VDF();
-  const Domaine_Cl_dis_base& zcl = zclo;
-  const int face_globale = f + domaine_vdf.premiere_face_bord(); // Maintenant numero dans le tableau global des faces.
+  const Domaine_VDF& domaine_vdf = zcl.domaine_VDF();
 
   int face_locale = -123;
-  const Cond_lim_base& cl = (f < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) :
-                            zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale);
+  const Cond_lim_base& cl = (face_globale < domaine_vdf.nb_faces()) ? zcl.condition_limite_de_la_face_reelle(face_globale, face_locale) : zcl.condition_limite_de_la_face_virtuelle(face_globale, face_locale); // real or virtual face lookup; face_locale is passed along and used below as the index inside the boundary
 
   return sub_type(Navier, cl) ? ref_cast(Navier, cl).coefficient_frottement_grad(face_locale,n) : 0.;
 }
diff --git a/src/VDF/Champs/Champ_Face_VDF.h b/src/VDF/Champs/Champ_Face_VDF.h
index 3205d4e880..f4bb1b7c6d 100644
--- a/src/VDF/Champs/Champ_Face_VDF.h
+++ b/src/VDF/Champs/Champ_Face_VDF.h
@@ -92,6 +92,11 @@ class Champ_Face_VDF : public Champ_Face_base, public Champ_Face_VDF_implementat
     return Champ_Face_VDF_implementation::valeur_a_elem_compo(position, le_poly, ncomp);
   }
 
+  inline DoubleTab& valeur_aux_centres_de_gravite(const Domaine& dom, DoubleTab& tab_valeurs) const override
+  {
+    return Champ_Face_VDF_implementation::valeur_aux_centres_de_gravite(tab_valeurs); // dom is not forwarded: only the output table is passed to the implementation
+  }
+
   inline DoubleTab& valeur_aux_elems(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& tab_valeurs) const override
   {
     return Champ_Face_VDF_implementation::valeur_aux_elems(positions, les_polys, tab_valeurs);
@@ -165,12 +170,12 @@ class Champ_Face_VDF : public Champ_Face_base, public Champ_Face_VDF_implementat
 
   DoubleTab tau_diag_;       // termes diagonaux du tenseur Grad
   DoubleTab tau_croises_;    // termes extradiagonaux du tenseur Grad
+  mutable DoubleTab val_imp_face_bord_; // Work array storing the values imposed on the boundary faces. Useful for the GPU
 };
 
-double Champ_Face_coeff_frottement_face_bord(const int, const int , const Domaine_Cl_VDF& zclo);
+void Champ_Face_coeff_frottement_face_bord(DoubleTab&, const Domaine_Cl_VDF& zcl); // fills the table given as first argument, indexed by global face number
+void Champ_Face_get_val_imp_face_bord(const double temp, DoubleTab&, const Domaine_Cl_VDF& zcl, const DoubleTab* tab_valeurs=nullptr); // table variant: one row per face, written for the faces of every boundary
 double Champ_Face_coeff_frottement_grad_face_bord(const int, const int , const Domaine_Cl_VDF& zclo);
-double Champ_Face_get_val_imp_face_bord_sym(const DoubleTab& tab_valeurs, const double temp,int face,int comp, const Domaine_Cl_VDF& zclo);
-double Champ_Face_get_val_imp_face_bord( const double temp,int face,int comp, const Domaine_Cl_VDF& zclo) ;
-double Champ_Face_get_val_imp_face_bord( const double temp,int face,int comp, int comp2, const Domaine_Cl_VDF& zclo) ;
+double Champ_Face_get_val_imp_face_bord(const double temp,int face,int comp, const Domaine_Cl_VDF& zclo, const DoubleTab* tab_valeurs=nullptr); // single-value variant: face is a global face number (the definition no longer adds premiere_face_bord())
 
 
 #endif /* Champ_Face_VDF_included */
diff --git a/src/VDF/Champs/Champ_Face_VDF_implementation.cpp b/src/VDF/Champs/Champ_Face_VDF_implementation.cpp
index 272cc32075..dc7ed330a5 100644
--- a/src/VDF/Champs/Champ_Face_VDF_implementation.cpp
+++ b/src/VDF/Champs/Champ_Face_VDF_implementation.cpp
@@ -19,6 +19,8 @@
 #include <LecFicDiffuse.h>
 #include <Domaine_VDF.h>
 #include <TRUSTTab.h>
+#include <kokkos++.h>
+#include <TRUSTArray_kokkos.tpp>
 
 DoubleTab& Champ_Face_VDF_implementation::valeur_aux_elems(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& val_elem) const
 {
@@ -35,53 +37,94 @@ DoubleVect& Champ_Face_VDF_implementation::valeur_a_elem(const DoubleVect& posit
   return valeur_a_elem_(le_champ().valeurs(), position, val, e);
 }
 
-DoubleTab& Champ_Face_VDF_implementation::valeur_aux_elems_(const DoubleTab& val_face, const DoubleTab& positions, const IntVect& les_polys, DoubleTab& val_elem) const
+DoubleTab& Champ_Face_VDF_implementation::valeur_aux_centres_de_gravite(DoubleTab& tab_val_elem) const // element value = mean of the two opposite face values in each direction
 {
-  if (val_elem.nb_dim() > 2)
+  if (tab_val_elem.nb_dim() > 2) // only tables with at most two dimensions are accepted
     {
       Cerr << "Erreur TRUST dans Champ_Face_implementation::valeur_aux_elems()" << finl;
       Cerr << "Le DoubleTab val a plus de 2 entrees" << finl;
       Process::exit();
     }
-
-
-  const int N = val_face.line_size(), D = Objet_U::dimension, M = le_champ().nb_comp();
-  DoubleVect val_e(N * D), x(D);
-  val_elem = 0.0;
-  //assert(val_elem.line_size()==N * std::min(D, M));
-
-  for (int p = 0; p < les_polys.size(); p++)
-    {
-      for (int d = 0; d < D; d++) x(d) = positions(p, d);
-      valeur_a_elem_(val_face, x, val_e, les_polys(p));
-      for (int i = 0; i < N * std::min(D, M); i++) val_elem(p, i) = val_e(i);
-    }
-
-  return val_elem;
+  const DoubleTab& tab_val_face = le_champ().valeurs(); // face values of the field itself
+  const int N = tab_val_face.line_size(), D = Objet_U::dimension;
+  const int nb_comp = le_champ().nb_comp();
+  tab_val_elem = 0.0; // entries not written by the kernel keep this value
+  int size = tab_val_elem.dimension(0); // the number of rows of the output table drives the loop
+  CIntTabView e_f = domaine_vdf().elem_faces().view_ro();
+  CDoubleTabView val_face = tab_val_face.view_ro();
+  DoubleTabView val_elem = tab_val_elem.view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, size), KOKKOS_LAMBDA(const int e)
+  {
+    // TODO: FIXME: case with line_size 1 but nb_dim != 2 ... seen in cathare3D
+    for (int d = 0; d < D; d++)
+      {
+        for (int n = 0; n < N; n++)
+          {
+            const double v1 = val_face(e_f(e, d), n); // face d of the element
+            const double v2 = val_face(e_f(e, d + D), n); // face d + D of the element
+            const double interp = 0.5 * (v1 + v2);
+            if (nb_comp == 1) // single-component field: nothing is written, the entry keeps the 0 set before the kernel
+              {
+              }
+            else if (d < nb_comp)
+              val_elem(e, N * d + n) = interp; // column layout: N * d + n
+          }
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  return tab_val_elem;
 }
 
-/* Elie SAIKALI : utilise pour CGNS => passer champ face a un champ vect aux faces ! */
-DoubleTab& Champ_Face_VDF_implementation::valeur_aux_faces_post_impl(const Domaine_VDF& vdf,  DoubleTab& result) const
+DoubleTab& Champ_Face_VDF_implementation::valeur_aux_elems_(const DoubleTab& tab_val_face, const DoubleTab& tab_positions, const IntVect& tab_les_polys, DoubleTab& tab_val_elem) const // interpolates the face values at point p inside element tab_les_polys(p)
 {
-  const Champ_base& cha = le_champ();
-  const DoubleTab& val = cha.valeurs();
-  const int nb_compo = cha.nb_comp(), N = val.line_size(), D = Objet_U::dimension;
-
-  if (nb_compo == 1)
-    Process::exit("TRUST error in Champ_Face_VDF_implementation::valeur_aux_faces_post_impl : A scalar field cannot be of Champ_Face type !");
-
-  const int nb_faces = vdf.nb_faces();
-
-  assert(nb_faces == val.dimension(0));
-
-  result.resize(nb_faces, N * D);
+  if (tab_val_elem.nb_dim() > 2) // only tables with at most two dimensions are accepted
+    {
+      Cerr << "Erreur TRUST dans Champ_Face_implementation::valeur_aux_elems()" << finl;
+      Cerr << "Le DoubleTab val a plus de 2 entrees" << finl;
+      Process::exit();
+    }
 
-  for (int f = 0; f < nb_faces; f++)
+  const int N = tab_val_face.line_size(), D = Objet_U::dimension;
+  const int nb_comp = le_champ().nb_comp();
+  tab_val_elem = 0.0; // entries not written by the kernel keep this value
+  const Domaine_VDF& domaine_VDF = domaine_vdf();
+  const Domaine& domaine_geom = get_domaine_geom();
+  int size = tab_les_polys.size(); // one output row per requested point
+  CIntTabView f_s = domaine_VDF.face_sommets().view_ro();
+  CIntTabView e_f = domaine_VDF.elem_faces().view_ro();
+  CDoubleTabView coord = domaine_geom.coord_sommets().view_ro();
+  CDoubleTabView positions = tab_positions.view_ro();
+  CIntArrView les_polys = tab_les_polys.view_ro();
+  CDoubleTabView val_face = tab_val_face.view_ro();
+  DoubleTabView val_elem = tab_val_elem.view_rw();
+  const double precision_geom = Objet_U::precision_geom; // copied to a local so that the kernel can capture it
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, size), KOKKOS_LAMBDA(const int p)
+  {
+    // TODO: FIXME: case with line_size 1 but nb_dim != 2 ... seen in cathare3D
+    const int e = les_polys(p);
+    if (e == -1) return; // no element for this point: the row keeps the 0 set before the kernel
     for (int d = 0; d < D; d++)
-      for (int n = 0; n < N; n++)
-        result(f, N * d + n) = val(f, n) * vdf.face_normales(f, d) / vdf.face_surfaces(f);
-
-  return result;
+      {
+        const int som0 = f_s(e_f(e, d), 0); // first vertex of face d
+        const int som1 = f_s(e_f(e, d + D), 0); // first vertex of face d + D
+        const double psi = (positions(p, d) - coord(som0, d)) / (coord(som1, d) - coord(som0, d)); // relative abscissa of the point between these two vertices along d
+        for (int n = 0; n < N; n++)
+          {
+            const double v1 = val_face(e_f(e, d), n);
+            const double v2 = val_face(e_f(e, d + D), n);
+            const double interp = interpolation(v1, v2, psi); // helper defined elsewhere; presumably a linear blend of v1 and v2 - confirm
+            if (nb_comp == 1)
+              {
+                if (Kokkos::fabs(psi) < precision_geom || Kokkos::fabs(1. - psi) < precision_geom) // written only when psi is within precision_geom of 0 or 1
+                  val_elem(p, 0) = interp;
+              }
+            else if (d < nb_comp)
+              val_elem(p, N * d + n) = interp; // column layout: N * d + n
+          }
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  return tab_val_elem;
 }
 
 DoubleVect& Champ_Face_VDF_implementation::valeur_a_elem_(const DoubleTab& val_face, const DoubleVect& position, DoubleVect& val, int e) const
@@ -115,16 +158,47 @@ DoubleVect& Champ_Face_VDF_implementation::valeur_a_elem_(const DoubleTab& val_f
   return val;
 }
 
-DoubleVect& Champ_Face_VDF_implementation::valeur_aux_elems_compo(const DoubleTab& positions, const IntVect& les_polys, DoubleVect& val, int ncomp) const
+/* Elie SAIKALI: used for CGNS => turns the face field into a vector field at the faces: result(f, N*d+n) = val(f, n) * face_normales(f, d) / face_surfaces(f) */
+DoubleTab& Champ_Face_VDF_implementation::valeur_aux_faces_post_impl(const Domaine_VDF& vdf,  DoubleTab& result) const
+{
+  const Champ_base& cha = le_champ();
+  const DoubleTab& val = cha.valeurs();
+  const int nb_compo = cha.nb_comp(), N = val.line_size(), D = Objet_U::dimension;
+
+  if (nb_compo == 1)
+    Process::exit("TRUST error in Champ_Face_VDF_implementation::valeur_aux_faces_post_impl : A scalar field cannot be of Champ_Face type !");
+
+  const int nb_faces = vdf.nb_faces();
+
+  assert(nb_faces == val.dimension(0)); // val must hold exactly one row per face of vdf
+
+  result.resize(nb_faces, N * D); // D blocks of N columns: column N * d + n is direction d of value n
+  ToDo_Kokkos("critical");
+  for (int f = 0; f < nb_faces; f++)
+    for (int d = 0; d < D; d++)
+      for (int n = 0; n < N; n++)
+        result(f, N * d + n) = val(f, n) * vdf.face_normales(f, d) / vdf.face_surfaces(f);
+
+  return result;
+}
+
+DoubleVect& Champ_Face_VDF_implementation::valeur_aux_elems_compo(const DoubleTab& tab_positions, const IntVect& tab_les_polys, DoubleVect& tab_val, int ncomp) const
 {
-  assert(val.size_totale() >= les_polys.size());
+  assert(tab_val.size_totale() >= tab_les_polys.size());
   const int D = Objet_U::dimension;
-  const DoubleTab& coord = domaine_vdf().domaine().coord_sommets();
-  const IntTab& f_s = domaine_vdf().face_sommets(), &e_f = domaine_vdf().elem_faces();
-  const DoubleTab& vals = le_champ().valeurs();
-  int size = les_polys.size();
-  for(int p = 0; p < size; p++)
-    {
+  const DoubleTab& tab_coord = domaine_vdf().domaine().coord_sommets();
+  const IntTab& tab_f_s = domaine_vdf().face_sommets(), &tab_e_f = domaine_vdf().elem_faces();
+  const DoubleTab& tab_vals = le_champ().valeurs();
+  int size = tab_les_polys.size();
+
+  CDoubleTabView positions = tab_positions.view_ro();
+  CIntArrView les_polys = tab_les_polys.view_ro();
+  DoubleArrView val = tab_val.view_wo();
+  CDoubleArrView vals = static_cast<const DoubleVect&>(tab_vals).view_ro();
+  CIntTabView e_f = tab_e_f.view_ro(); 
+  CIntTabView f_s = tab_f_s.view_ro();
+  CDoubleTabView coord = tab_coord.view_ro();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),range_1D(0, size), KOKKOS_LAMBDA(const int p){
       int e = les_polys(p);
       if (e<0)
         {
@@ -132,13 +206,15 @@ DoubleVect& Champ_Face_VDF_implementation::valeur_aux_elems_compo(const DoubleTa
         }
       else
         {
-          const double val1 = vals(e_f(e, ncomp)), val2 = vals(e_f(e, D + ncomp));
+          // const double val1 = tab_vals(tab_e_f(e, ncomp)), val2 = tab_vals(tab_e_f(e, D + ncomp));
+          const double val1 = vals(e_f(e, ncomp)), val2 = vals(e_f(e, D + ncomp)); // I'm a bit worried about the layout consistency here moving between host (LayoutRight) and device (LayoutLeft)!
           const int som0 = f_s(e_f(e, ncomp), 0), som1 = f_s(e_f(e, D + ncomp), 0);
           const double psi = (positions(p, ncomp) - coord(som0, ncomp)) / (coord(som1, ncomp) - coord(som0, ncomp));
           val(p) = interpolation(val1, val2, psi);
         }
-    }
-  return val;
+    });
+    end_gpu_timer(__KERNEL_NAME__);
+  return tab_val;
 }
 
 double Champ_Face_VDF_implementation::valeur_a_elem_compo(const DoubleVect& position, int e, int d) const
@@ -177,28 +253,52 @@ DoubleTab& Champ_Face_VDF_implementation::valeur_aux_sommets(const Domaine& dom,
 
   const int nb_elem_tot = dom.nb_elem_tot(), nb_som = dom.nb_som(), nb_som_elem = dom.nb_som_elem();
   const int N = le_champ().valeurs().line_size(), D = Objet_U::dimension;
-  IntVect compteur(nb_som);
-  ch_som = 0, compteur = 0;
+  ArrOfInt tab_compteur(nb_som);
+  ch_som = 0, tab_compteur = 0;
 
-  DoubleVect position(D), val_e(N * D);
+  // Count queries (only nodes local to this process)
+  int nb_queries = 0;
   for (int e = 0; e < nb_elem_tot; e++)
-    for (int j = 0, s; j < nb_som_elem; j++)
-      if ((s = dom.sommet_elem(e, j)) < nb_som)
-        {
-          for(int d = 0; d < D; d++)
-            position(d) = dom.coord(s, d);
+    for (int j = 0; j < nb_som_elem; j++)
+      if (dom.sommet_elem(e, j) < nb_som)
+        nb_queries++;
 
-          compteur[s]++;
-          valeur_a_elem(position, val_e, e);
-          for (int n = 0; n < N; n++)
+  DoubleTab tab_positions(nb_queries, D);
+  IntVect tab_les_polys(nb_queries);
+  IntVect tab_sommets(nb_queries);
+
+  int q = 0;
+  for (int e = 0; e < nb_elem_tot; e++)
+    for (int j = 0; j < nb_som_elem; j++)
+      {
+        const int s = dom.sommet_elem(e, j);
+        if (s < nb_som)
+          {
             for (int d = 0; d < D; d++)
-              ch_som(s, N * d + n) += val_e(N * d + n);
-        }
+              tab_positions(q, d) = dom.coord(s, d);
+            tab_les_polys(q) = e;
+            tab_sommets(q) = s;
+            q++;
+          }
+      }
+
+  DoubleTab tab_val_elem(nb_queries, N * D);
+  tab_val_elem = 0.;
+  valeur_aux_elems_(le_champ().valeurs(), tab_positions, tab_les_polys, tab_val_elem);
+
+  for (int qi = 0; qi < nb_queries; qi++)
+    {
+      const int s = tab_sommets(qi);
+      tab_compteur[s]++;
+      for (int n = 0; n < N; n++)
+        for (int d = 0; d < D; d++)
+          ch_som(s, N * d + n) += tab_val_elem(qi, N * d + n);
+    }
 
   for (int s = 0; s < nb_som; s++)
     for (int n = 0; n < N; n++)
       for (int d = 0; d < D; d++)
-        ch_som(s, N * d + n) /= compteur[s];
+        ch_som(s, N * d + n) /= tab_compteur[s];
 
   return ch_som;
 }
@@ -223,6 +323,7 @@ DoubleVect& Champ_Face_VDF_implementation::valeur_aux_sommets_compo(const Domain
   compteur = 0;
 
   DoubleVect position(Objet_U::dimension);
+  ToDo_Kokkos("critical");
   for (num_elem=0; num_elem<nb_elem_tot; num_elem++)
     for (j=0; j<nb_som_elem; j++)
       {
@@ -236,6 +337,7 @@ DoubleVect& Champ_Face_VDF_implementation::valeur_aux_sommets_compo(const Domain
           }
       }
 
+  ToDo_Kokkos("critical");
   for (num_som=0; num_som<nb_som; num_som++)
     ch_som(num_som) /= compteur[num_som];
 
@@ -266,6 +368,7 @@ int Champ_Face_VDF_implementation::remplir_coord_noeuds_et_polys(DoubleTab& posi
   remplir_coord_noeuds(positions);
   polys.resize(nb_faces);
 
+  ToDo_Kokkos("critical");
   for(int face=0; face<nb_faces; face++)
     if( (polys(face)=face_voisins(face,0)) == -1)
       polys(face)=face_voisins(face,1);
@@ -398,82 +501,84 @@ int Champ_Face_VDF_implementation::imprime_Face(Sortie& os, int ncomp) const
   return 1;
 }
 
-DoubleTab& Champ_Face_VDF_implementation::trace(const Frontiere_dis_base& fr, const DoubleTab& y, DoubleTab& x,int distant) const
+DoubleTab& Champ_Face_VDF_implementation::trace(const Frontiere_dis_base& fr, const DoubleTab& tab_y, DoubleTab& tab_x,int distant) const
 {
   assert(distant==0);
   const Front_VF& fr_vf=ref_cast(Front_VF, fr);
   const Domaine_VDF& zvdf=domaine_vdf();
-  const IntVect& ori = zvdf.orientation();
+  const IntVect& tab_ori = zvdf.orientation();
   const IntTab& face_voisins = zvdf.face_voisins();
   const IntTab& elem_faces = zvdf.elem_faces();
   int elem1,elem2;
-  int face,i,f1,f2,f3,f4;
+  int i,f1,f2,f3,f4;
   int nb_faces = fr_vf.nb_faces();
-  if (x.dimension(0)!=nb_faces)
+  if (tab_x.dimension(0)!=nb_faces)
     {
       Cerr << "The number of faces " << nb_faces << " on the remote boundary " << fr.le_nom() << finl;
-      Cerr << "does not match the number of faces " << x.dimension(0) << " on the local boundary." << finl;
+      Cerr << "does not match the number of faces " << tab_x.dimension(0) << " on the local boundary." << finl;
       Cerr << "Please, check if the boundary condition is not applied on wrong boundaries." << finl;
       Process::exit();
     }
 //  assert(x.dimension(1)==Objet_U::dimension);
 
-  if (x.dimension(1) == 1)
+  DoubleTabView x = tab_x.view_wo();
+  CDoubleArrView y = static_cast<const DoubleVect&>(tab_y).view_ro();
+  CIntArrView ori = tab_ori.view_ro();
+  if (tab_x.dimension(1) == 1)
     {
-      for (i=0; i<fr_vf.nb_faces(); i++)
-        {
-          face=fr_vf.num_premiere_face()+i;
-          x(i,0)=y(face);
-        }
-      return x;
+      const int ff = fr_vf.num_premiere_face(); // offset of this boundary's faces in y
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, fr_vf.nb_faces()), KOKKOS_LAMBDA(const int k) {
+        x(k, 0) = y(ff + k); // scalar trace: row k of x takes the value of face ff + k
+      });
+      end_gpu_timer(__KERNEL_NAME__); // pairs with start_gpu_timer above, as the other kernels of this file do
+      return tab_x;
     }
-
-  for (i=0; i<fr_vf.nb_faces(); i++)
-    {
-      face=fr_vf.num_premiere_face()+i;
-      x(i,ori[face])=y(face);
-    }
-
+  const int ff = fr_vf.num_premiere_face(); // offset of this boundary's faces in y and ori
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, fr_vf.nb_faces()), KOKKOS_LAMBDA(const int k) {
+    x(k, ori(ff + k)) = y(ff + k); // only the component along the face orientation is written by this kernel
+  });
+  end_gpu_timer(__KERNEL_NAME__); // pairs with start_gpu_timer above, as the other kernels of this file do
+  ToDo_Kokkos("critical");
   for (i=0; i<fr_vf.nb_faces(); i++)
     {
-      face=fr_vf.num_premiere_face()+i;
+      int face=fr_vf.num_premiere_face()+i;
       elem1 = face_voisins(face,0);
       if (elem1 != -1)
         {
           if (Objet_U::dimension == 2)
             {
-              if (ori[face] == 0)
+              if (tab_ori[face] == 0)
                 {
                   f1 = elem_faces(elem1,1);
                   f2 = elem_faces(elem1,3);
-                  x(i,1)= 0.5*(y[f1] + y[f2]);
+                  tab_x(i,1)= 0.5*(tab_y[f1] + tab_y[f2]);
                 }
               else
                 {
                   f1 = elem_faces(elem1,0);
                   f2 = elem_faces(elem1,2);
-                  x(i,0)= 0.5*(y[f1] + y[f2]);
+                  tab_x(i,0)= 0.5*(tab_y[f1] + tab_y[f2]);
                 }
             }
           else if (Objet_U::dimension == 3)
             {
-              if (ori[face] == 0)
+              if (tab_ori[face] == 0)
                 {
                   f1 = elem_faces(elem1,1);
                   f2 = elem_faces(elem1,4);
                   f3 = elem_faces(elem1,2);
                   f4 = elem_faces(elem1,5);
-                  x(i,1)= 0.5*(y[f1] + y[f2]);
-                  x(i,2)= 0.5*(y[f3] + y[f4]);
+                  tab_x(i,1)= 0.5*(tab_y[f1] + tab_y[f2]);
+                  tab_x(i,2)= 0.5*(tab_y[f3] + tab_y[f4]);
                 }
-              else if (ori[face] == 1)
+              else if (tab_ori[face] == 1)
                 {
                   f1 = elem_faces(elem1,0);
                   f2 = elem_faces(elem1,3);
                   f3 = elem_faces(elem1,2);
                   f4 = elem_faces(elem1,5);
-                  x(i,0)= 0.5*(y[f1] + y[f2]);
-                  x(i,2)= 0.5*(y[f3] + y[f4]);
+                  tab_x(i,0)= 0.5*(tab_y[f1] + tab_y[f2]);
+                  tab_x(i,2)= 0.5*(tab_y[f3] + tab_y[f4]);
                 }
               else // ori[face] = 2
                 {
@@ -481,8 +586,8 @@ DoubleTab& Champ_Face_VDF_implementation::trace(const Frontiere_dis_base& fr, co
                   f2 = elem_faces(elem1,3);
                   f3 = elem_faces(elem1,1);
                   f4 = elem_faces(elem1,4);
-                  x(i,0)= 0.5*(y[f1] + y[f2]);
-                  x(i,1)= 0.5*(y[f3] + y[f4]);
+                  tab_x(i,0)= 0.5*(tab_y[f1] + tab_y[f2]);
+                  tab_x(i,1)= 0.5*(tab_y[f3] + tab_y[f4]);
                 }
             }
         }
@@ -491,38 +596,38 @@ DoubleTab& Champ_Face_VDF_implementation::trace(const Frontiere_dis_base& fr, co
           elem2 = face_voisins(face,1);
           if (Objet_U::dimension == 2)
             {
-              if (ori[face] == 0)
+              if (tab_ori[face] == 0)
                 {
                   f1 = elem_faces(elem2,1);
                   f2 = elem_faces(elem2,3);
-                  x(i,1)= 0.5*(y[f1] + y[f2]);
+                  tab_x(i,1)= 0.5*(tab_y[f1] + tab_y[f2]);
                 }
               else
                 {
                   f1 = elem_faces(elem2,0);
                   f2 = elem_faces(elem2,2);
-                  x(i,0)= 0.5*(y[f1] + y[f2]);
+                  tab_x(i,0)= 0.5*(tab_y[f1] + tab_y[f2]);
                 }
             }
           else if (Objet_U::dimension == 3)
             {
-              if (ori[face] == 0)
+              if (tab_ori[face] == 0)
                 {
                   f1 = elem_faces(elem2,1);
                   f2 = elem_faces(elem2,4);
                   f3 = elem_faces(elem2,2);
                   f4 = elem_faces(elem2,5);
-                  x(i,1)= 0.5*(y[f1] + y[f2]);
-                  x(i,2)= 0.5*(y[f3] + y[f4]);
+                  tab_x(i,1)= 0.5*(tab_y[f1] + tab_y[f2]);
+                  tab_x(i,2)= 0.5*(tab_y[f3] + tab_y[f4]);
                 }
-              else if (ori[face] == 1)
+              else if (tab_ori[face] == 1)
                 {
                   f1 = elem_faces(elem2,0);
                   f2 = elem_faces(elem2,3);
                   f3 = elem_faces(elem2,2);
                   f4 = elem_faces(elem2,5);
-                  x(i,0)= 0.5*(y[f1] + y[f2]);
-                  x(i,2)= 0.5*(y[f3] + y[f4]);
+                  tab_x(i,0)= 0.5*(tab_y[f1] + tab_y[f2]);
+                  tab_x(i,2)= 0.5*(tab_y[f3] + tab_y[f4]);
                 }
               else // ori[face] = 2
                 {
@@ -530,12 +635,12 @@ DoubleTab& Champ_Face_VDF_implementation::trace(const Frontiere_dis_base& fr, co
                   f2 = elem_faces(elem2,3);
                   f3 = elem_faces(elem2,1);
                   f4 = elem_faces(elem2,4);
-                  x(i,0)= 0.5*(y[f1] + y[f2]);
-                  x(i,1)= 0.5*(y[f3] + y[f4]);
+                  tab_x(i,0)= 0.5*(tab_y[f1] + tab_y[f2]);
+                  tab_x(i,1)= 0.5*(tab_y[f3] + tab_y[f4]);
                 }
             }
         }
     }
   // Useless ?x.echange_espace_virtuel();
-  return x;
+  return tab_x;
 }
diff --git a/src/VDF/Champs/Champ_Face_VDF_implementation.h b/src/VDF/Champs/Champ_Face_VDF_implementation.h
index 6ce8859239..d9f3c3d272 100644
--- a/src/VDF/Champs/Champ_Face_VDF_implementation.h
+++ b/src/VDF/Champs/Champ_Face_VDF_implementation.h
@@ -17,6 +17,7 @@
 #define Champ_Face_VDF_implementation_included
 
 #include <Champ_implementation_divers.h>
+#include <kokkos++.h>
 
 class Frontiere_dis_base;
 class Domaine_VDF;
@@ -27,6 +28,7 @@ class Champ_Face_VDF_implementation : public Champ_implementation_divers
 public:
   DoubleVect& valeur_a_elem(const DoubleVect& position, DoubleVect& val, int le_poly) const override;
   double valeur_a_elem_compo(const DoubleVect& position, int le_poly, int ncomp) const override;
+  DoubleTab& valeur_aux_centres_de_gravite(DoubleTab& valeurs) const;
   DoubleTab& valeur_aux_elems(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs) const override;
   DoubleTab& valeur_aux_elems_passe(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs) const;
   DoubleVect& valeur_aux_elems_compo(const DoubleTab& positions, const IntVect& les_polys, DoubleVect& valeurs, int ncomp) const override;
@@ -38,21 +40,23 @@ class Champ_Face_VDF_implementation : public Champ_implementation_divers
 
   DoubleTab& valeur_aux_faces_post_impl(const Domaine_VDF&,  DoubleTab& result) const;
 
+  protected_but_public_for_cuda
+  DoubleTab& valeur_aux_elems_(const DoubleTab& val_face, const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs) const;
+
 protected:
   virtual const Domaine_VDF& domaine_vdf() const = 0;
-  inline double interpolation(const double, const double, const double) const;
+  KOKKOS_INLINE_FUNCTION static double interpolation(const double, const double, const double);
   DoubleTab& trace(const Frontiere_dis_base& fr, const DoubleTab& y, DoubleTab& x, int distant) const;
 
 private:
-  DoubleTab& valeur_aux_elems_(const DoubleTab& val_face, const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs) const;
   DoubleVect& valeur_a_elem_(const DoubleTab& val_face, const DoubleVect& position, DoubleVect& val, int le_poly) const;
 };
-inline double Champ_Face_VDF_implementation::interpolation(const double val1, const double val2, const double psi) const
+KOKKOS_INLINE_FUNCTION double Champ_Face_VDF_implementation::interpolation(const double val1, const double val2, const double psi)
 {
   double epsilon=1.e-12;
-  if (std::fabs(psi) < epsilon)
+  if (Kokkos::fabs(psi) < epsilon)
     return val1 ;
-  else if (std::fabs(1.-psi) < epsilon)
+  else if (Kokkos::fabs(1.-psi) < epsilon)
     return val2 ;
   else
     return val1 + psi * (val2-val1) ;
diff --git a/src/VDF/Champs/Champ_Fonc_Face_VDF.cpp b/src/VDF/Champs/Champ_Fonc_Face_VDF.cpp
index 98ef8d7afa..9c3d33d6a2 100644
--- a/src/VDF/Champs/Champ_Fonc_Face_VDF.cpp
+++ b/src/VDF/Champs/Champ_Fonc_Face_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -63,6 +63,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch)
 
   if (sub_type(Champ_Uniforme, ch))
     {
+      ToDo_Kokkos("critical");
       for (int num_face = 0; num_face < nb_faces; num_face++)
         val(num_face) = v(0, orientation(num_face));
     }
@@ -72,6 +73,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch)
       const IntTab& face_voisins = domaine_VDF.face_voisins();
       int num_face;
 
+      ToDo_Kokkos("critical");
       for (num_face = 0; num_face < ndeb_int; num_face++)
         {
           ori = orientation(num_face);
@@ -82,6 +84,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch)
             val(num_face) = v(face_voisins(num_face, 1), ori);
         }
 
+      ToDo_Kokkos("critical");
       for (num_face = ndeb_int; num_face < nb_faces; num_face++)
         {
           ori = orientation(num_face);
@@ -106,6 +109,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch)
       int nbz = 0;
       int num_face, k;
 
+      ToDo_Kokkos("critical");
       for (num_face = 0; num_face < nb_faces; num_face++)
         {
           ori = orientation(num_face);
@@ -134,6 +138,7 @@ Champ_base& Champ_Fonc_Face_VDF::affecter_(const Champ_base& ch)
       if (dimension == 3)
         ch.valeur_aux_compo(positionZ, W, 2);
       nbx = nby = nbz = 0;
+      ToDo_Kokkos("critical");
       for (num_face = 0; num_face < nb_faces; num_face++)
         {
           ori = orientation(num_face);
diff --git a/src/VDF/Champs/Champ_Fonc_Face_VDF.h b/src/VDF/Champs/Champ_Fonc_Face_VDF.h
index 07ef7d0605..322bc24675 100644
--- a/src/VDF/Champs/Champ_Fonc_Face_VDF.h
+++ b/src/VDF/Champs/Champ_Fonc_Face_VDF.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -48,6 +48,11 @@ class Champ_Fonc_Face_VDF : public Champ_Fonc_base, public Champ_Face_VDF_implem
     return Champ_Face_VDF_implementation::valeur_a_elem_compo(position, le_poly, ncomp);
   }
 
+  inline DoubleTab& valeur_aux_centres_de_gravite(const Domaine& dom, DoubleTab& tab_valeurs) const override
+  {
+    return Champ_Face_VDF_implementation::valeur_aux_centres_de_gravite(tab_valeurs);
+  }
+
   inline DoubleTab& valeur_aux_elems(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& tab_valeurs) const override
   {
     return Champ_Face_VDF_implementation::valeur_aux_elems(positions, les_polys, tab_valeurs);
diff --git a/src/VDF/Champs/Champ_Fonc_Q1_VDF.cpp b/src/VDF/Champs/Champ_Fonc_Q1_VDF.cpp
index 88547783d3..589e247cfa 100644
--- a/src/VDF/Champs/Champ_Fonc_Q1_VDF.cpp
+++ b/src/VDF/Champs/Champ_Fonc_Q1_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2022, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -37,6 +37,7 @@ int Champ_Fonc_Q1_VDF::imprime(Sortie& os, int ncomp) const
   const DoubleTab& val = valeurs();
   int som;
   os << nb_som << finl;
+  ToDo_Kokkos("critical");
   for (som = 0; som < nb_som; som++)
     {
       if (dimension == 3)
diff --git a/src/VDF/Champs/Champ_Fonc_Tabule_P0_VDF.cpp b/src/VDF/Champs/Champ_Fonc_Tabule_P0_VDF.cpp
index ba3887fb36..acc18e15f5 100644
--- a/src/VDF/Champs/Champ_Fonc_Tabule_P0_VDF.cpp
+++ b/src/VDF/Champs/Champ_Fonc_Tabule_P0_VDF.cpp
@@ -14,8 +14,10 @@
 *****************************************************************************/
 
 #include <Champ_Fonc_Tabule_P0_VDF.h>
+#include <TRUSTTrav.h>
 #include <Domaine_VF.h>
 #include <Table.h>
+#include <ParserView.h>
 
 Implemente_instanciable(Champ_Fonc_Tabule_P0_VDF, "Champ_Fonc_Tabule_P0_VDF", Champ_Fonc_P0_VDF);
 
@@ -31,46 +33,7 @@ void Champ_Fonc_Tabule_P0_VDF::associer_param(const VECT(OBS_PTR(Champ_base)) &l
 
 void Champ_Fonc_Tabule_P0_VDF::mettre_a_jour(double t)
 {
-  const Domaine_VF& domaine_VF = le_dom_VF.valeur();
-  const Table& table = la_table.valeur();
-  DoubleTab& mes_valeurs = valeurs();
-  const int nb_elem = domaine_VF.nb_elem(), nb_elem_tot = domaine_VF.nb_elem_tot(), nb_param = les_ch_param.size();
-  DoubleTabs val_params_aux_elems;
-  for (int i = 0; i < nb_param; i++)
-    {
-      assert(les_ch_param[i]->valeurs().dimension(1) == 1 || les_ch_param[i]->valeurs().dimension(1) == mes_valeurs.dimension(1));
-      DoubleTab vp(nb_elem_tot, les_ch_param[i]->valeurs().dimension(1));
-      val_params_aux_elems.add(vp);
-    }
-  const DoubleTab& centres_de_gravites = domaine_VF.xp();
-  IntVect les_polys(nb_elem_tot);
-  for (int elem = 0; elem < nb_elem_tot; elem++)
-    les_polys(elem) = elem;
-
-  // Estimate the field parameter on cells:
-  for (int i = 0; i < nb_param; i++)
-    les_ch_param[i]->valeur_aux_elems(centres_de_gravites, les_polys, val_params_aux_elems[i]);
-  // Compute the field according to the parameter field
-  if (table.isfonction() != 2)
-    {
-      const int nbcomp = mes_valeurs.dimension(1);
-      std::vector<double> vals;
-      vals.reserve(nb_param); // Pre-allocate space once
-      for (int num_elem = 0; num_elem < nb_elem; num_elem++)
-        for (int ncomp = 0; ncomp < nbcomp; ncomp++)
-          {
-            vals.clear();
-            for (int n = 0; n < nb_param; n++)
-              vals.push_back(val_params_aux_elems[n](num_elem, les_ch_param[n]->valeurs().dimension(1) == 1 ? 0 : ncomp));
-            mes_valeurs(num_elem, ncomp) = table.val(vals, ncomp);
-          }
-    }
-  else
-    {
-      table.valeurs(val_params_aux_elems[0], centres_de_gravites, t, mes_valeurs);
-    }
-
-  Champ_Fonc_base::mettre_a_jour(t);
+  Champ_Fonc_P0_base::mettre_a_jour(t, la_table.valeur(), les_ch_param);
 }
 
 int Champ_Fonc_Tabule_P0_VDF::initialiser(const double un_temps)
diff --git a/src/VDF/Champs/Champ_P0_VDF.cpp b/src/VDF/Champs/Champ_P0_VDF.cpp
index 41e72794c2..269dd4efa2 100644
--- a/src/VDF/Champs/Champ_P0_VDF.cpp
+++ b/src/VDF/Champs/Champ_P0_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -59,6 +59,7 @@ DoubleVect Champ_P0_VDF::moyenne(const DoubleVect& porosite_elem) const
   moy = 0;
   double coef, sum_vol = 0;
 
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < domaine_VDF().nb_elem(); num_elem++)
     {
       coef = porosite_elem(num_elem) * volumes(num_elem);
@@ -82,6 +83,7 @@ double Champ_P0_VDF::moyenne(const DoubleVect& porosite_elem, int ncomp) const
   double moy = 0;
   double coef, sum_vol = 0;
 
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < domaine_VDF().nb_elem(); num_elem++)
     {
       coef = porosite_elem(num_elem) * volumes(num_elem);
@@ -232,6 +234,7 @@ double Champ_P0_VDF::integrale_espace(int ncomp) const
   const DoubleTab& val = valeurs();
   assert(ncomp < val.line_size());
 
+  ToDo_Kokkos("critical");
   for (elem = 0; elem < nb_elem; elem++)
     integr += val(elem, ncomp) * volumes(elem);
 
diff --git a/src/VDF/Champs/Champ_P0_VDF.h b/src/VDF/Champs/Champ_P0_VDF.h
index 62ea5b38ca..037c787b27 100644
--- a/src/VDF/Champs/Champ_P0_VDF.h
+++ b/src/VDF/Champs/Champ_P0_VDF.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -35,6 +35,7 @@ public :
   DoubleTab& remplir_coord_noeuds(DoubleTab& ) const override;
   int imprime(Sortie& os, int nb_compo_) const override;
   double integrale_espace(int ncomp) const override;
+  DoubleTab& valeur_aux_centres_de_gravite(const Domaine& dom, DoubleTab& result) const override { return Champ_implementation_P0::valeur_aux_centres_de_gravite(dom, result); }
 };
 
 #endif /* Champ_P0_VDF_included */
diff --git a/src/VDF/Champs/Champ_front_debit_QC.cpp b/src/VDF/Champs/Champ_front_debit_QC.cpp
index 080f4a3b21..f9b11b3092 100644
--- a/src/VDF/Champs/Champ_front_debit_QC.cpp
+++ b/src/VDF/Champs/Champ_front_debit_QC.cpp
@@ -129,21 +129,24 @@ void Champ_front_debit_QC::mettre_a_jour(double tps)
   int nfin = ndeb + nb_faces;
   const DoubleTab& tab_rhonp1P0 =fluide->loi_etat()->rho_np1();
   if (ismoyen==0)
+    {
+      ToDo_Kokkos("critical");
+      for (int num_face=ndeb; num_face<nfin; num_face++)
+        {
+          int n0 = face_voisins(num_face, 0);
 
-    for (int num_face=ndeb; num_face<nfin; num_face++)
-      {
-        int n0 = face_voisins(num_face, 0);
-
-        if (n0 == -1)
-          n0 = face_voisins(num_face, 1);
-        for (int ori=0; ori<dim; ori++)
-          tab(num_face-ndeb,ori)=Debit(ori)/tab_rhonp1P0(n0);
-      }
+          if (n0 == -1)
+            n0 = face_voisins(num_face, 1);
+          for (int ori=0; ori<dim; ori++)
+            tab(num_face-ndeb,ori)=Debit(ori)/tab_rhonp1P0(n0);
+        }
+    }
   else
     {
       int num_face;
       double rho_moy=0,S=0,s;
       const DoubleVect& surface=le_dom_VDF.face_surfaces();
+      ToDo_Kokkos("critical");
       for ( num_face=ndeb; num_face<nfin; num_face++)
         {
           int n0 = face_voisins(num_face, 0);
@@ -157,6 +160,7 @@ void Champ_front_debit_QC::mettre_a_jour(double tps)
       // Optimization: combine 2 mp_sum into 1 collective call
       mp_sum_for_each(S, rho_moy);
       rho_moy/=S;
+      ToDo_Kokkos("critical");
       for ( num_face=ndeb; num_face<nfin; num_face++)
         for (int ori=0; ori<dim; ori++)
           tab(num_face-ndeb,ori)=Debit(ori)/rho_moy;
diff --git a/src/VDF/Champs/Champ_front_debit_QC_fonc_t.cpp b/src/VDF/Champs/Champ_front_debit_QC_fonc_t.cpp
index f3360abc39..caae94bfd9 100644
--- a/src/VDF/Champs/Champ_front_debit_QC_fonc_t.cpp
+++ b/src/VDF/Champs/Champ_front_debit_QC_fonc_t.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -140,25 +140,28 @@ void Champ_front_debit_QC_fonc_t::mettre_a_jour(double tps)
   int nfin = ndeb + nb_faces;
   const DoubleTab& tab_rhonp1P0 =fluide->loi_etat()->rho_np1();
   if (ismoyen==0)
+    {
+      ToDo_Kokkos("critical");
+      for (int num_face=ndeb; num_face<nfin; num_face++)
+        {
+          int n0 = face_voisins(num_face, 0);
 
-    for (int num_face=ndeb; num_face<nfin; num_face++)
-      {
-        int n0 = face_voisins(num_face, 0);
-
-        if (n0 == -1)
-          n0 = face_voisins(num_face, 1);
-        for (int ori=0; ori<dim; ori++)
-          {
-            f_debit_t[ori].setVar("t",tps);
-            Debit(num_face-ndeb,ori)=f_debit_t[ori].eval();
-            tab(num_face-ndeb,ori)=Debit(num_face-ndeb,ori)/tab_rhonp1P0(n0);
-          }
-      }
+          if (n0 == -1)
+            n0 = face_voisins(num_face, 1);
+          for (int ori=0; ori<dim; ori++)
+            {
+              f_debit_t[ori].setVar("t",tps);
+              Debit(num_face-ndeb,ori)=f_debit_t[ori].eval();
+              tab(num_face-ndeb,ori)=Debit(num_face-ndeb,ori)/tab_rhonp1P0(n0);
+            }
+        }
+    }
   else
     {
       int num_face;
       double rho_moy=0,S=0,s;
       const DoubleVect& surface=le_dom_VDF.face_surfaces();
+      ToDo_Kokkos("critical");
       for ( num_face=ndeb; num_face<nfin; num_face++)
         {
           int n0 = face_voisins(num_face, 0);
@@ -172,6 +175,7 @@ void Champ_front_debit_QC_fonc_t::mettre_a_jour(double tps)
       S = mp_sum(S);
       rho_moy = mp_sum(rho_moy);
       rho_moy/=S;
+      ToDo_Kokkos("critical");
       for ( num_face=ndeb; num_face<nfin; num_face++)
         for (int ori=0; ori<dim; ori++)
           {
diff --git a/src/VDF/Champs/Champ_val_tot_sur_vol_VDF.cpp b/src/VDF/Champs/Champ_val_tot_sur_vol_VDF.cpp
index 6b89772e7a..fa91a58c87 100644
--- a/src/VDF/Champs/Champ_val_tot_sur_vol_VDF.cpp
+++ b/src/VDF/Champs/Champ_val_tot_sur_vol_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -50,6 +50,7 @@ DoubleVect& Champ_val_tot_sur_vol_VDF::eval_contrib_loc(const Domaine_dis_base&
       int size_sz = sz.nb_elem_tot();
       int el;
 
+      ToDo_Kokkos("critical");
       for (int elem=0; elem<size_sz; elem++)
         {
           el = sz(elem);
@@ -60,6 +61,7 @@ DoubleVect& Champ_val_tot_sur_vol_VDF::eval_contrib_loc(const Domaine_dis_base&
       cpt++;
     }
 
+  ToDo_Kokkos("critical");
   for (int elem=0; elem<nb_elem; elem++)
     vol_glob_pond(0) += vol(elem)*por_elem(elem);
 
diff --git a/src/VDF/Champs/Correlation_Vec_Sca_VDF.cpp b/src/VDF/Champs/Correlation_Vec_Sca_VDF.cpp
index bf5ac1fb95..70a78e0d28 100644
--- a/src/VDF/Champs/Correlation_Vec_Sca_VDF.cpp
+++ b/src/VDF/Champs/Correlation_Vec_Sca_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -49,6 +49,7 @@ void Correlation_Vec_Sca_VDF::mettre_a_jour(double tps)
   DoubleTab valeurs_Vec(nb_elem, mon_champ_Vec_->nb_comp());
   mon_champ_Vec_->valeur_aux_centres_de_gravite(dom, valeurs_Vec);
 
+  ToDo_Kokkos("critical");
   for(int elem=0; elem<nb_elem; elem++)
     {
       correlation(elem,0) = valeurs_Sca(elem,0);
diff --git a/src/VDF/Champs/Courant_maille_Champ_Face.cpp b/src/VDF/Champs/Courant_maille_Champ_Face.cpp
index dded454477..3b70198c39 100644
--- a/src/VDF/Champs/Courant_maille_Champ_Face.cpp
+++ b/src/VDF/Champs/Courant_maille_Champ_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -49,6 +49,7 @@ void Courant_maille_Champ_Face::mettre_a_jour(double tps)
   const int nb_faces = domaine_vdf().nb_faces();
   DoubleTab& co = valeurs(); // Courant de maille
   double dt = sch_->pas_de_temps();
+  ToDo_Kokkos("critical");
   for (int face = 0; face < nb_faces; face++)
     {
       // Calcul de la taille de maille entourant la face
diff --git a/src/VDF/Champs/Reynolds_maille_Champ_Face.cpp b/src/VDF/Champs/Reynolds_maille_Champ_Face.cpp
index 3a18dca308..3b9c517e3a 100644
--- a/src/VDF/Champs/Reynolds_maille_Champ_Face.cpp
+++ b/src/VDF/Champs/Reynolds_maille_Champ_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -47,6 +47,7 @@ void Reynolds_maille_Champ_Face::mettre_a_jour(double tps)
 {
   const int nb_faces = domaine_vdf().nb_faces();
   DoubleTab& re = valeurs(); // Reynolds de maille
+  ToDo_Kokkos("critical");
   for (int face = 0; face < nb_faces; face++)
     {
       // Calcul de la viscosite face
diff --git a/src/VDF/Champs/T_paroi_Champ_P0_VDF.cpp b/src/VDF/Champs/T_paroi_Champ_P0_VDF.cpp
index 3f08952437..79c567467f 100644
--- a/src/VDF/Champs/T_paroi_Champ_P0_VDF.cpp
+++ b/src/VDF/Champs/T_paroi_Champ_P0_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -79,6 +79,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps)
 
         if (sub_type(Periodique, la_cl.valeur()))
           {
+            ToDo_Kokkos("critical");
             for (int num_face = ndeb; num_face < nfin; num_face++)
               {
                 const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
@@ -92,6 +93,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps)
           }
         else if (sub_type(Dirichlet, la_cl.valeur()))
           {
+            ToDo_Kokkos("critical");
             for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
               {
                 int elem = face_voisins(num_face, 0);
@@ -105,6 +107,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps)
           }
         else if (sub_type(Dirichlet_homogene, la_cl.valeur()))
           {
+            ToDo_Kokkos("critical");
             for (int num_face = ndeb; num_face < nfin; num_face++)
               {
                 int elem = face_voisins(num_face, 0);
@@ -117,6 +120,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps)
           }
         else if (sub_type(Neumann_homogene, la_cl.valeur()) || sub_type(Navier, la_cl.valeur())) // grad nulle
           {
+            ToDo_Kokkos("critical");
             for (int num_face = ndeb; num_face < nfin; num_face++)
               {
                 int elem = face_voisins(num_face, 0);
@@ -130,6 +134,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps)
         else if (sub_type(Echange_externe_impose, la_cl.valeur()))
           {
             const Echange_externe_impose& la_cl_ext = ref_cast(Echange_externe_impose, la_cl.valeur());
+            ToDo_Kokkos("critical");
             for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
               {
                 const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
@@ -160,6 +165,7 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps)
         else if (sub_type(Echange_global_impose, la_cl.valeur()))
           {
             const Echange_global_impose& la_cl_glob = ref_cast(Echange_global_impose, la_cl.valeur());
+            ToDo_Kokkos("critical");
             for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
               {
                 const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
@@ -175,20 +181,24 @@ void T_paroi_Champ_P0_VDF::me_calculer(double tps)
               }
           }
         else if (sub_type(Neumann_paroi, la_cl.valeur()))
-          for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
-            {
-              const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
-              const double signe = elem1 > -1 ? -1.0 : 1.0;
-              const int elem = elem1 > -1 ? elem1 : elem2;
-              const double e = Objet_U::axi ? dvdf.dist_norm_bord_axi(num_face) : dvdf.dist_norm_bord(num_face);
-              const double nu = eval.nu_2_impl(elem, k), t_elem = temp(elem, k);
-
-              val(elem, k) += signe * e * ref_cast(Neumann_paroi, la_cl.valeur()).flux_impose(num_face_cl, k) / nu + t_elem;
-              indx_pond(elem, k)++;
-            }
+          {
+            ToDo_Kokkos("critical");
+            for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
+              {
+                const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
+                const double signe = elem1 > -1 ? -1.0 : 1.0;
+                const int elem = elem1 > -1 ? elem1 : elem2;
+                const double e = Objet_U::axi ? dvdf.dist_norm_bord_axi(num_face) : dvdf.dist_norm_bord(num_face);
+                const double nu = eval.nu_2_impl(elem, k), t_elem = temp(elem, k);
+
+                val(elem, k) += signe * e * ref_cast(Neumann_paroi, la_cl.valeur()).flux_impose(num_face_cl, k) / nu + t_elem;
+                indx_pond(elem, k)++;
+              }
+          }
       }
 
   // On moyenne la contribution
+  ToDo_Kokkos("critical");
   for (int elem = 0; elem < n_elem; elem++)
     for (int k = 0; k < N; k++) // pour multiphase
       if (indx_pond(elem, k) > 0)
diff --git a/src/VDF/Champs/Taux_cisaillement_P0_VDF.cpp b/src/VDF/Champs/Taux_cisaillement_P0_VDF.cpp
index a981348b27..f295f664cf 100644
--- a/src/VDF/Champs/Taux_cisaillement_P0_VDF.cpp
+++ b/src/VDF/Champs/Taux_cisaillement_P0_VDF.cpp
@@ -17,6 +17,8 @@
 #include <Champ_Face_VDF.h>
 #include <Domaine_Cl_VDF.h>
 #include <Domaine_VF.h>
+#include <kokkos++.h>
+#include <TRUSTArray_kokkos.tpp>
 
 Implemente_instanciable(Taux_cisaillement_P0_VDF, "Taux_cisaillement_P0_VDF", Champ_Fonc_P0_VDF);
 
@@ -35,13 +37,18 @@ void Taux_cisaillement_P0_VDF::mettre_a_jour(double tps)
   int nb_elem = le_dom_VF->nb_elem();
   int N = vitesse_->valeurs().line_size();
 
-  DoubleTab tmp(nb_elem,N);
-  vitesse_->calcul_S_barre_Multiphase(vitesse_->valeurs(), tmp, le_dom_Cl_VDF.valeur());
-
-  DoubleTab& S = valeurs(); // Shear rate
-  for (int n = 0; n < N; n++)
-    for (int i = 0; i < nb_elem; i++)
-      S(i,n) = sqrt(tmp(i,n));
+  DoubleTrav tab_S_barre(nb_elem,N);
+  vitesse_->calcul_S_barre_Multiphase(vitesse_->valeurs(), tab_S_barre, le_dom_Cl_VDF.valeur());
+
+  // Element-wise square root of S_barre, computed for each element in a Kokkos kernel
+  CDoubleTabView S_barre = tab_S_barre.view_ro();
+  DoubleTabView S = valeurs().view_wo(); // Shear rate
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int i)
+  {
+    for (int n = 0; n < N; n++)
+      S(i,n) = Kokkos::sqrt(S_barre(i,n));
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 
   changer_temps(tps);
   Champ_Fonc_base::mettre_a_jour(tps);
diff --git a/src/VDF/Cond_Lim/Echange_contact_Correlation_VDF.cpp b/src/VDF/Cond_Lim/Echange_contact_Correlation_VDF.cpp
index 128073a5e4..0c5a30ee9b 100644
--- a/src/VDF/Cond_Lim/Echange_contact_Correlation_VDF.cpp
+++ b/src/VDF/Cond_Lim/Echange_contact_Correlation_VDF.cpp
@@ -670,6 +670,7 @@ void Echange_contact_Correlation_VDF::calculer_h_solide(DoubleTab& tab,const Equ
 
   e.resize(front_vf.nb_faces());
 
+  ToDo_Kokkos("critical");
   for (int face=ndeb; face<nfin; face++)
     e(face-ndeb) = zvdf_2.dist_norm_bord(face);
 
@@ -678,6 +679,7 @@ void Echange_contact_Correlation_VDF::calculer_h_solide(DoubleTab& tab,const Equ
     {
       //Cerr << "raccord local homogene et conductivite non uniforme" << finl;
       const DoubleTab& tab_lambda = le_milieu.conductivite().valeurs();
+      ToDo_Kokkos("critical");
       for (int face=ndeb; face<nfin; face++)
         {
           int elem = face_voisins(face,0);
@@ -692,6 +694,7 @@ void Echange_contact_Correlation_VDF::calculer_h_solide(DoubleTab& tab,const Equ
     }
   else  // la conductivite est un OWN_PTR(Champ_base) uniforme
     {
+      ToDo_Kokkos("critical");
       for (int face=ndeb; face<nfin; face++)
         {
           for(i=0; i<nb_comp; i++)
diff --git a/src/VDF/Cond_Lim/Echange_contact_VDF.cpp b/src/VDF/Cond_Lim/Echange_contact_VDF.cpp
index 0b595e0db0..24ccabf278 100644
--- a/src/VDF/Cond_Lim/Echange_contact_VDF.cpp
+++ b/src/VDF/Cond_Lim/Echange_contact_VDF.cpp
@@ -129,6 +129,7 @@ void calculer_h_local(DoubleTab& tab,const Equation_base& une_eqn,const Domaine_
     }
   if( ! dequiv )
     {
+      ToDo_Kokkos("critical");
       for (int face=ndeb; face<nfin; face++)
         {
           e(face-ndeb) = zvdf_2.dist_norm_bord(face);
@@ -139,6 +140,7 @@ void calculer_h_local(DoubleTab& tab,const Equation_base& une_eqn,const Domaine_
   if(!sub_type(Champ_Uniforme,le_milieu.conductivite()))
     {
       const DoubleTab& lambda = le_milieu.conductivite().valeurs();
+      ToDo_Kokkos("critical");
       for (int face=ndeb; face<nfin; face++)
         {
           int elem = face_voisins(face,0);
@@ -153,6 +155,7 @@ void calculer_h_local(DoubleTab& tab,const Equation_base& une_eqn,const Domaine_
     }
   else  // la conductivite est un OWN_PTR(Champ_base) uniforme
     {
+      ToDo_Kokkos("critical");
       for (int face=ndeb; face<nfin; face++)
         {
           for(i=0; i<nb_comp; i++)
@@ -201,6 +204,7 @@ void calculer_h_distant(DoubleTab& tab,const Equation_base& une_eqn,const Domain
     {
       DoubleTab lambda;
       front_vf.frontiere().trace_elem_distant(le_milieu.conductivite().valeurs(),lambda);
+      ToDo_Kokkos("critical");
       for (int face=0; face<nb_faces_raccord1; face++)
         for(i=0; i<nb_comp; i++)
           {
@@ -211,6 +215,7 @@ void calculer_h_distant(DoubleTab& tab,const Equation_base& une_eqn,const Domain
   else  // la conductivite est un OWN_PTR(Champ_base) uniforme
     {
       const DoubleTab& lambda = le_milieu.conductivite().valeurs();
+      ToDo_Kokkos("critical");
       for (int face=0; face<nb_faces_raccord1; face++)
         for(i=0; i<nb_comp; i++)
           {
diff --git a/src/VDF/Cond_Lim/Flux_radiatif_VDF.cpp b/src/VDF/Cond_Lim/Flux_radiatif_VDF.cpp
index 1abbef43de..ca4f1075ce 100644
--- a/src/VDF/Cond_Lim/Flux_radiatif_VDF.cpp
+++ b/src/VDF/Cond_Lim/Flux_radiatif_VDF.cpp
@@ -64,6 +64,7 @@ void Flux_radiatif_VDF::evaluer_cl_rayonnement(Champ_front_base& Tb, const Champ
   double epsi = -123., T = -123.;
 
   // Boucle sur les faces de le_bord
+  ToDo_Kokkos("critical");
   for (int face = ndeb; face < nfin; face++)
     {
       int elem = face_voisins(face, 0);
@@ -200,6 +201,7 @@ void Flux_radiatif_VDF::calculer_flux_radiatif(const Equation_base& eq_temp)
   double Tbord = -123., n = -123.;
 
   // On fait une boucle sur les faces
+  ToDo_Kokkos("critical");
   for (int face = 0; face < nb_faces; face++)
     {
       int elem = face_voisins(face + ndeb, 0);
diff --git a/src/VDF/Cond_Lim/Neumann_paroi_rayo_semi_transp_VDF.cpp b/src/VDF/Cond_Lim/Neumann_paroi_rayo_semi_transp_VDF.cpp
index b8282e48b2..4cbc7f3a18 100644
--- a/src/VDF/Cond_Lim/Neumann_paroi_rayo_semi_transp_VDF.cpp
+++ b/src/VDF/Cond_Lim/Neumann_paroi_rayo_semi_transp_VDF.cpp
@@ -132,6 +132,7 @@ void Neumann_paroi_rayo_semi_transp_VDF::calculer_temperature_bord(double temps)
 
   int face = 0;
   int num_face;
+  ToDo_Kokkos("critical");
   for (face = 0; face < nb_faces; face++)
     {
       num_face = face + ndeb;
@@ -190,6 +191,7 @@ void Neumann_paroi_rayo_semi_transp_VDF::completer()
   // Debut de la boucle sur les faces de bord
   //
 
+  ToDo_Kokkos("critical");
   for (face = 0; face < front_vf.nb_faces(); face++)
     {
       int elem = face_voisins(face + ndeb, 0);
diff --git a/src/VDF/Cond_Lim/PlaqThVDF.cpp b/src/VDF/Cond_Lim/PlaqThVDF.cpp
index bab33ab420..8c8034cecc 100644
--- a/src/VDF/Cond_Lim/PlaqThVDF.cpp
+++ b/src/VDF/Cond_Lim/PlaqThVDF.cpp
@@ -68,6 +68,7 @@ void PlaqThVDF::mettre_a_jour(double )
           if (le_dom_VDF.front_VF(n_bord).le_nom() == front.le_nom())
             boundary_index=n_bord;
         }
+      ToDo_Kokkos("critical");
       for(int face=0; face < nbfs2; face++)
         {
           // double e1 = loipar.d_equiv(face);
@@ -105,6 +106,7 @@ void PlaqThVDF::mettre_a_jour(double )
   int premiere = front.num_premiere_face();
   int nbfs2=front.nb_faces()/2;
   int derniere = premiere + nbfs2;
+  ToDo_Kokkos("critical");
   for(face=premiere; face < derniere; face++)
     {
       int num=face-premiere;
diff --git a/src/VDF/Cond_Lim/Sortie_libre_Gradient_Pression_impose.cpp b/src/VDF/Cond_Lim/Sortie_libre_Gradient_Pression_impose.cpp
index 378db2632f..64c66f16c9 100644
--- a/src/VDF/Cond_Lim/Sortie_libre_Gradient_Pression_impose.cpp
+++ b/src/VDF/Cond_Lim/Sortie_libre_Gradient_Pression_impose.cpp
@@ -61,6 +61,7 @@ void Sortie_libre_Gradient_Pression_impose::completer()
   coeff.resize(nb_faces_loc);
   trace_pression_int.resize(nb_faces_loc);
 
+  ToDo_Kokkos("critical");
   for (face = ndeb; face < ndeb + nb_faces_loc; face++)
     if (face_voisins(face, 0) != -1)
       coeff[face - ndeb] = volumes_entrelaces[face] / face_surfaces[face];
@@ -80,6 +81,7 @@ void Sortie_libre_Gradient_Pression_impose::mettre_a_jour(double temps)
   int nb_faces_loc = le_bord.nb_faces();
 
   assert(pression_interne);
+  ToDo_Kokkos("critical");
   for (int face = ndeb; face < ndeb + nb_faces_loc; face++)
     trace_pression_int[face - ndeb] = pression_interne->valeur_au_bord(face);
 }
diff --git a/src/VDF/Cond_Lim/Sortie_libre_Pression_imposee_Orlansky.cpp b/src/VDF/Cond_Lim/Sortie_libre_Pression_imposee_Orlansky.cpp
index 129238203a..4cece14a46 100644
--- a/src/VDF/Cond_Lim/Sortie_libre_Pression_imposee_Orlansky.cpp
+++ b/src/VDF/Cond_Lim/Sortie_libre_Pression_imposee_Orlansky.cpp
@@ -104,6 +104,7 @@ void Sortie_libre_Pression_imposee_Orlansky::mettre_a_jour(double temps)
 
   int face, compo;
 
+  ToDo_Kokkos("critical");
   for (face = ndeb; face < ndeb + nb_faces_loc; face++)
     {
       int i = face - ndeb;
@@ -154,53 +155,59 @@ void Sortie_libre_Pression_imposee_Orlansky::mettre_a_jour(double temps)
   //Debog::verifier_bord("Orlansky::mettre_a_jour() : vitesse_moins_deux : " , vitesse_moins_deux, ndeb);
   //Debog::verifier_bord("Orlansky::mettre_a_jour() : vitesse_moins_un : " , vitesse_moins_un, ndeb);
 
+  ToDo_Kokkos("critical");
   for (compo = 0; compo < dimension; compo++)
-    for (face = ndeb; face < ndeb + nb_faces_loc; face++)
-      {
-        int i = face - ndeb;
-
-        int ori = zvdf.orientation(face);
-
-        vitesse_temps_moins_deux(i, compo) = vitesse_temps_moins_un(i, compo);
-        vitesse_temps_moins_un(i, compo) = vit_ext(i, compo);
-        vitesse_moins_un_temps_moins_deux(i, compo) = vitesse_moins_un_temps_moins_un(i, compo);
-
-        vitesse_moins_un_temps_moins_un(i, compo) = vitesse_moins_un(i, compo);
-        vitesse_moins_deux_temps_moins_un(i, compo) = vitesse_moins_deux(i, compo);
-
-        int elem_un = zvdf.face_voisins(face, 0);
-        if (elem_un < 0)
-          elem_un = zvdf.face_voisins(face, 1);
-        int face_moins_un = zvdf.elem_faces(elem_un, ori);
-        if (face_moins_un == face)
-          face_moins_un = zvdf.elem_faces(elem_un, ori + dimension);
-        double vit = 0.5 * (vitesse(zvdf.elem_faces(elem_un, compo)) + vitesse(zvdf.elem_faces(elem_un, compo + dimension)));
-
-        vitesse_moins_un(i, compo) = vit;
-
-        int elem_deux = zvdf.face_voisins(face_moins_un, 0);
-        if (elem_deux == elem_un)
-          elem_deux = zvdf.face_voisins(face_moins_un, 1);
-        vit = 0.5 * (vitesse(zvdf.elem_faces(elem_deux, compo)) + vitesse(zvdf.elem_faces(elem_deux, compo + dimension)));
-        vitesse_moins_deux(i, compo) = vit;
-
-        double pre_m_un_t_m_deux = vitesse_moins_un_temps_moins_deux(i, compo);
-        double pre_m_deux_t_m_un = vitesse_moins_deux_temps_moins_un(i, compo);
-        double pre_m_un = vitesse_moins_un(i, compo);
-
-        if (pre_m_un_t_m_deux == pre_m_un)
-          VPhiV(i, compo) = 0;
-        else
-          VPhiV(i, compo) = (pre_m_un_t_m_deux - pre_m_un) / (pre_m_un + pre_m_un_t_m_deux - 2 * pre_m_deux_t_m_un);
-        if (VPhiV(i, compo) <= 1.e-24)
-          VPhiV(i, compo) = 0.0;
-        if (VPhiV(i, compo) > 1.)
-          VPhiV(i, compo) = 1.0;
-        assert(VPhiV(i, compo) < 1.e12);
-
-        vit_ext(i, compo) = (1 - VPhiV(i, compo)) / (1 + VPhiV(i, compo)) * vitesse_temps_moins_un(i, compo) + (2 * VPhiV(i, compo) / (1 + VPhiV(i, compo))) * vitesse_moins_un(i, compo);
-
-      }
+    {
+      for (face = ndeb; face < ndeb + nb_faces_loc; face++)
+        {
+          int i = face - ndeb;
+
+          int ori = zvdf.orientation(face);
+
+          vitesse_temps_moins_deux(i, compo) = vitesse_temps_moins_un(i, compo);
+          vitesse_temps_moins_un(i, compo) = vit_ext(i, compo);
+          vitesse_moins_un_temps_moins_deux(i, compo) = vitesse_moins_un_temps_moins_un(i, compo);
+
+          vitesse_moins_un_temps_moins_un(i, compo) = vitesse_moins_un(i, compo);
+          vitesse_moins_deux_temps_moins_un(i, compo) = vitesse_moins_deux(i, compo);
+
+          int elem_un = zvdf.face_voisins(face, 0);
+          if (elem_un < 0)
+            elem_un = zvdf.face_voisins(face, 1);
+          int face_moins_un = zvdf.elem_faces(elem_un, ori);
+          if (face_moins_un == face)
+            face_moins_un = zvdf.elem_faces(elem_un, ori + dimension);
+          double vit = 0.5 * (vitesse(zvdf.elem_faces(elem_un, compo)) +
+                              vitesse(zvdf.elem_faces(elem_un, compo + dimension)));
+
+          vitesse_moins_un(i, compo) = vit;
+
+          int elem_deux = zvdf.face_voisins(face_moins_un, 0);
+          if (elem_deux == elem_un)
+            elem_deux = zvdf.face_voisins(face_moins_un, 1);
+          vit = 0.5 *
+                (vitesse(zvdf.elem_faces(elem_deux, compo)) + vitesse(zvdf.elem_faces(elem_deux, compo + dimension)));
+          vitesse_moins_deux(i, compo) = vit;
+
+          double pre_m_un_t_m_deux = vitesse_moins_un_temps_moins_deux(i, compo);
+          double pre_m_deux_t_m_un = vitesse_moins_deux_temps_moins_un(i, compo);
+          double pre_m_un = vitesse_moins_un(i, compo);
+
+          if (pre_m_un_t_m_deux == pre_m_un)
+            VPhiV(i, compo) = 0;
+          else
+            VPhiV(i, compo) = (pre_m_un_t_m_deux - pre_m_un) / (pre_m_un + pre_m_un_t_m_deux - 2 * pre_m_deux_t_m_un);
+          if (VPhiV(i, compo) <= 1.e-24)
+            VPhiV(i, compo) = 0.0;
+          if (VPhiV(i, compo) > 1.)
+            VPhiV(i, compo) = 1.0;
+          assert(VPhiV(i, compo) < 1.e12);
+
+          vit_ext(i, compo) = (1 - VPhiV(i, compo)) / (1 + VPhiV(i, compo)) * vitesse_temps_moins_un(i, compo) +
+                              (2 * VPhiV(i, compo) / (1 + VPhiV(i, compo))) * vitesse_moins_un(i, compo);
+
+        }
+    }
   Debog::verifier_bord("Orlansky::mettre_a_jour() :  vit_ext : ", vit_ext, ndeb);
   Debog::verifier_bord("Orlansky::mettre_a_jour() : VPhiV  : ", VPhiV, ndeb);
 }
diff --git a/src/VDF/Geometrie/Domaine_Cl_VDF.cpp b/src/VDF/Geometrie/Domaine_Cl_VDF.cpp
index 3f86323c61..769482a038 100644
--- a/src/VDF/Geometrie/Domaine_Cl_VDF.cpp
+++ b/src/VDF/Geometrie/Domaine_Cl_VDF.cpp
@@ -353,8 +353,6 @@ void Domaine_Cl_VDF::imposer_cond_lim(Champ_Inc_base& ch, double temps)
     {
       Champ_Face_VDF& ch_face = ref_cast(Champ_Face_VDF, ch);
       const Domaine_VDF& mon_dom_VDF = ch_face.domaine_vdf();
-      int ndeb,nfin, num_face;
-
       for(int i=0; i<nb_cond_lim(); i++)
         {
           const Cond_lim_base& la_cl = les_conditions_limites(i).valeur();
@@ -366,66 +364,58 @@ void Domaine_Cl_VDF::imposer_cond_lim(Champ_Inc_base& ch, double temps)
                   // sur deux faces de periodicite qui sont en face l'une de l'autre
                   const Periodique& la_cl_perio = ref_cast(Periodique,la_cl);
                   const Front_VF& le_bord = ref_cast(Front_VF,la_cl.frontiere_dis());
-                  ndeb = le_bord.num_premiere_face();
-                  nfin = ndeb + le_bord.nb_faces();
-                  int voisine;
-                  double moy;
-                  for (num_face=ndeb; num_face<nfin; num_face++)
-                    {
-                      voisine = la_cl_perio.face_associee(num_face-ndeb) + ndeb;
-                      if ( ch_tab[num_face] != ch_tab[voisine] )
-                        {
-                          //                           Cerr << "dans Domaine_Cl_VDF::imposer_cond_lim : on reajuste les vitesses!! pour la face num=" << num_face << finl;
-                          //                           Cerr << "difference = ch_tab[num_face]-ch_tab[voisine]=" << ch_tab[num_face]-ch_tab[voisine] << finl;
-                          moy = 0.5*(ch_tab[num_face] + ch_tab[voisine]);
-                          ch_tab[num_face] = moy;
-                          ch_tab[voisine] = moy;
-                        }
-                    }
-                  // Il ne faut pas le faire a la premiere cl mais une fois toutes les cl faites une fois, cas multi perio avec ci non perio
-                  // init = 1;
+                  int ndeb = le_bord.num_premiere_face();
+                  int nfin = ndeb + le_bord.nb_faces();
+
+                  // Kokkos kernel: average the value of each periodic face with that of its associated face
+                  // Read-only view of the associated-face indices, read-write view of the field values
+                  CIntArrView face_associee = la_cl_perio.face_associee().view_ro();
+                  DoubleArrView tab = static_cast<ArrOfDouble&>(ch_tab).view_rw();
+                  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+                  {
+                    int voisine = face_associee(num_face-ndeb) + ndeb;
+                    if (tab(num_face) != tab(voisine))
+                      {
+                        double moy = 0.5 * (tab(num_face) + tab(voisine));
+                        // Atomic stores: the iteration handling the associated face may write these same two entries.
+                        // NOTE(review): the reads of tab() above are not atomic - confirm a concurrent store cannot yield a stale moy
+                        Kokkos::atomic_store(&tab(num_face), moy);
+                        Kokkos::atomic_store(&tab(voisine), moy);
+                      }
+                  });
+                  end_gpu_timer(__KERNEL_NAME__);
                 }
             }
-          else if( sub_type(Navier,la_cl) )
+          else if( sub_type(Navier,la_cl) || sub_type(Dirichlet_paroi_fixe,la_cl) || sub_type(Dirichlet_paroi_defilante,la_cl) )
             {
               const Front_VF& le_bord = ref_cast(Front_VF,la_cl.frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-              for (num_face=ndeb; num_face<nfin; num_face++)
+              int ndeb = le_bord.num_premiere_face();
+              int nfin = ndeb + le_bord.nb_faces();
+              DoubleTabView tab = ch_tab.view_wo();
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
                 for (int n = 0; n < N; n++)
-                  ch_tab(num_face, n) = 0;
+                  tab(num_face, n) = 0.0;
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
           else if ( sub_type(Dirichlet_entree_fluide,la_cl) )
             {
               const Dirichlet_entree_fluide& la_cl_diri = ref_cast(Dirichlet_entree_fluide,la_cl);
               const Front_VF& le_bord = ref_cast(Front_VF,la_cl.frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-
-              for (num_face = ndeb; num_face < nfin; num_face++)
-                for (int n = 0; n < N; n++)
-                  {
-                    // WEC : optimisable (pour chaque face recherche le bon temps !)
-                    ch_tab(num_face, n) = la_cl_diri.val_imp_au_temps(temps, num_face - ndeb, N * mon_dom_VDF.orientation(num_face) + n);
-                  }
-            }
-          else if ( sub_type(Dirichlet_paroi_fixe,la_cl) )
-            {
-              const Front_VF& le_bord = ref_cast(Front_VF,la_cl.frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-              for (num_face=ndeb; num_face<nfin; num_face++)
-                for (int n = 0; n < N; n++)
-                  ch_tab(num_face, n) = 0;
-            }
-          else if ( sub_type(Dirichlet_paroi_defilante,la_cl) )
-            {
-              const Front_VF& le_bord = ref_cast(Front_VF,la_cl.frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-              for (num_face=ndeb; num_face<nfin; num_face++)
+              int ndeb = le_bord.num_premiere_face();
+              int nfin = ndeb + le_bord.nb_faces();
+              CDoubleTabView val_imp = la_cl_diri.tab_val_imp(temps).view_ro();
+              CIntArrView orientation = mon_dom_VDF.orientation().view_ro();
+              DoubleTabView tab = ch_tab.view_wo();
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
+                const int idx = num_face - ndeb;
+                const int orient = orientation(num_face);
                 for (int n = 0; n < N; n++)
-                  ch_tab(num_face, n) = 0;
+                  tab(num_face, n) = val_imp(idx, N * orient + n);
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
         }
       init = 1;
diff --git a/src/VDF/Geometrie/Domaine_VDF.cpp b/src/VDF/Geometrie/Domaine_VDF.cpp
index 696ce43ba6..5339270e28 100644
--- a/src/VDF/Geometrie/Domaine_VDF.cpp
+++ b/src/VDF/Geometrie/Domaine_VDF.cpp
@@ -105,6 +105,7 @@ void Domaine_VDF::compute_sort_key(Faces& les_faces, IntTab& sort_key)
       sort_key(i, 1) = i;
     }
 
+  ToDo_Kokkos("critical");
   for (int i=nb_faces_front; i < nb_faces; i++)
     {
       const int ori = orientation_[i];
@@ -124,6 +125,7 @@ void Domaine_VDF::renumber_faces(Faces& les_faces, IntTab& sort_key)
   const int nb_faces = les_faces.nb_faces();
 
   IntVect old_orien(orientation_);
+  ToDo_Kokkos("critical");
   for (int i=nb_faces_front; i < nb_faces; i++)
     {
       const int idx = sort_key(i, 1);
@@ -227,6 +229,7 @@ void Domaine_VDF::remplir_face_normales()
   face_normales_ = xv_; // already has // structure
   face_normales_ = 0.;
 
+  ToDo_Kokkos("critical");
   for (int f = 0; f < nb_faces_tot(); f++)
     {
       int ori = orientation(f);
@@ -246,6 +249,7 @@ void Domaine_VDF::calculer_volumes_entrelaces()
 
   const int nb_faces_front = premiere_face_int();
   const int nbf = nb_faces();
+  ToDo_Kokkos("critical");
   for (int num_face = 0; num_face<nbf; num_face++)
     {
       const double f = (num_face < nb_faces_front) ? 2. : 1.;
@@ -353,6 +357,7 @@ void Domaine_VDF::genere_aretes()
   P1=0;
   P2=0;
   double eps = 10.0*Objet_U::precision_geom;
+  ToDo_Kokkos("critical");
   for (int face=0; face<premiere_face_int(); face++)
     {
       int ori = orientation(face);
@@ -887,6 +892,7 @@ void Domaine_VDF::creer_elements_fictifs(const Domaine_Cl_dis_base& zcldisbase)
             {
               ndeb = le_bord.num_premiere_face();
               nfin = ndeb + le_bord.nb_faces();
+              ToDo_Kokkos("critical");
               for (face=ndeb; face<nfin; face++)
                 if (face_voisins(face,0) != -1)
                   face_voisins_fictifs_(face,1) = compteur++;
@@ -912,6 +918,7 @@ DoubleVect& Domaine_VDF::dist_norm_bord(DoubleVect& dist, const Nom& nom_bord) c
             {
               dist.resize(fr_vf.nb_faces());
               int ndeb = fr_vf.num_premiere_face();
+              ToDo_Kokkos("critical");
               for (int face=ndeb; face<ndeb+fr_vf.nb_faces(); face++)
                 dist(face-ndeb) = dist_norm_bord_axi(face);
             }
@@ -926,6 +933,7 @@ DoubleVect& Domaine_VDF::dist_norm_bord(DoubleVect& dist, const Nom& nom_bord) c
             {
               dist.resize(fr_vf.nb_faces());
               int ndeb = fr_vf.num_premiere_face();
+              ToDo_Kokkos("critical");
               for (int face=ndeb; face<ndeb+fr_vf.nb_faces(); face++)
                 dist(face-ndeb) = dist_norm_bord(face);
             }
diff --git a/src/VDF/Geometrie/Domaine_VDF.h b/src/VDF/Geometrie/Domaine_VDF.h
index 009e935d17..9a71d88c89 100644
--- a/src/VDF/Geometrie/Domaine_VDF.h
+++ b/src/VDF/Geometrie/Domaine_VDF.h
@@ -305,7 +305,7 @@ inline int Domaine_VDF::orientation(int i) const
 /*! @brief
  *
  */
-inline double Domaine_VDF::dist_face(int fac1, int fac2, int k) const
+KOKKOS_INLINE_FUNCTION double Domaine_VDF::dist_face(int fac1, int fac2, int k) const
 {
   // Attention cette methode n'est plus appelee par les methodes dist_face de
   // Eval_Diff_VDF_Multi_inco_const.cpp et Eval_Diff_VDF_const.cpp pour optimiser les evaluateurs
@@ -652,11 +652,13 @@ inline double Domaine_VDF::delta_C(int elem) const
 /*! @brief
  *
  */
+/* ToDo Kokkos: clean up. Returns face_voisins_(face,i), where face is entry k+i*dim of elem_faces_ for neighbour i of num_face. */
 inline int Domaine_VDF::amont_amont(int num_face, int i) const
 {
-  int k=orientation_[num_face];
+  int dim = elem_faces_.dimension(1) == 4 ? 2 : 3; // second extent of elem_faces_ equal to 4 => 2, otherwise 3 (replaces dimension)
+  int k = orientation_[num_face]; // k is the orientation of num_face
   int num_elem = face_voisins_(num_face,i);
-  int face = elem_faces_(num_elem,k+i*dimension);
+  int face = elem_faces_(num_elem,k+i*dim);
   return face_voisins_(face,i);
 }
 
@@ -672,39 +674,25 @@ inline int Domaine_VDF::face_amont_princ(int num_face, int i) const
   return elem;
 }
 
-/*! @brief
- *
- */
 inline int Domaine_VDF::face_amont_conj(int num_face, int k, int i) const
 {
-  int ori = orientation(num_face);
-  int elem = face_voisins_(num_face,1);
-  int face_conj=-2,face,elem_bis=-2;
-
-  if(elem != -1)
+  const int ori = orientation(num_face);
+  const int elem1 = face_voisins_(num_face, 1); // neighbour slot 1 of num_face is tried first
+  if (elem1 != -1)
     {
-      face = elem_faces_(elem, k+i*dimension);
-      elem_bis = face_voisins_(face,i);
+      const int elem_bis = face_voisins_(elem_faces_(elem1, k + i * dimension), i); // NOTE(review): still uses dimension, whereas amont_amont now derives dim from elem_faces_
       if (elem_bis != -1)
-        face_conj = elem_faces_(elem_bis, ori);
-      else
-        face_conj = -1;
+        return elem_faces_(elem_bis, ori); // face of elem_bis at local index ori
     }
-  if ((elem==-1) || (elem_bis==-1))
+  const int elem0 = face_voisins_(num_face, 0); // reached when slot 1 is -1 or led to elem_bis == -1: fall back to slot 0
+  assert(elem0 != -1 || elem1 != -1); // face must have at least one element neighbor
+  if (elem0 != -1)
     {
-      elem = face_voisins_(num_face,0);
-      if(elem != -1)
-        {
-          face = elem_faces_(elem, k+i*dimension);
-          elem_bis = face_voisins_(face,i);
-          if (elem_bis != -1)
-            face_conj = elem_faces_(elem_bis, ori+dimension);
-          else
-            face_conj = -1;
-        }
+      const int elem_bis = face_voisins_(elem_faces_(elem0, k + i * dimension), i); // same lookup as above, starting from elem0
+      if (elem_bis != -1)
+        return elem_faces_(elem_bis, ori + dimension); // face of elem_bis at local index ori + dimension
     }
-  assert(face_conj!=-2);
-  return face_conj;
+  return -1; // neither neighbour slot led to a valid elem_bis
 }
 
 /*! @brief Determine la face voisine de notre face en prevoyant que cette derniere puisse etre de type bord.
@@ -848,11 +836,219 @@ inline double Domaine_VDF::dist_face_period(int fac1 , int fac2, int k) const
   double dist= std::fabs(coord_sommets(face_sommets(fac1,1),k)-xv_(fac1,k));
   dist += std::fabs(xv_(fac2,k) - coord_sommets(face_sommets(fac2,0),k));
   return dist;
-
 }
 
-#endif
+// ToDo Kokkos: suppress/clean Domaine_VDF methods if useless now like amont_amont...
+struct Domaine_VDF_View // Read-only views on Domaine_VDF arrays plus accessors usable inside Kokkos kernels
+{
+private:
+  mutable CIntArrView ind_faces_virt_bord_;
+  mutable CIntArrView orientation_;
+  mutable CIntTabView face_voisins_;
+  mutable CIntTabView elem_faces_;
+  mutable CDoubleTabView xp_;
+  mutable CDoubleTabView xv_;
+  mutable CIntTabView face_sommets_;
+  mutable CDoubleTabView coord_sommets_;
+  mutable int dim = 0, nb_faces_ = 0; // zero until set() runs, so a default-constructed object never holds indeterminate values
+public:
+  Domaine_VDF_View() {} // members keep their default values until set() is called
+  Domaine_VDF_View(const Domaine_VDF& dom) { set(dom); }
+  void set(const Domaine_VDF& dom) const // const because every member is mutable: a const object can still be (re)bound to dom
+  {
+    ind_faces_virt_bord_ = dom.ind_faces_virt_bord().view_ro();
+    orientation_ = dom.orientation().view_ro();
+    face_voisins_ = dom.face_voisins().view_ro();
+    elem_faces_ = dom.elem_faces().view_ro();
+    xp_ = dom.xp().view_ro();
+    xv_ = dom.xv().view_ro();
+    face_sommets_ = dom.face_sommets().view_ro();
+    coord_sommets_ = dom.domaine().coord_sommets().view_ro();
+    dim = elem_faces_.extent(1) == 4 ? 2 : 3; // 4 columns in elem_faces_ => 2D, otherwise 3D
+    nb_faces_ = dom.nb_faces();
+  }
+  KOKKOS_INLINE_FUNCTION
+  int nb_faces() const { return nb_faces_; }
+  KOKKOS_INLINE_FUNCTION
+  int ind_faces_virt_bord(int num_face) const { return ind_faces_virt_bord_(num_face); }
+  KOKKOS_INLINE_FUNCTION
+  int amont_amont(int num_face, int i) const // same computation as Domaine_VDF::amont_amont, done on the views
+  {
+    int k = orientation_[num_face];
+    int num_elem = face_voisins_(num_face,i);
+    int face = elem_faces_(num_elem,k+i*dim);
+    return face_voisins_(face,i);
+  }
+  KOKKOS_INLINE_FUNCTION
+  int face_amont_princ(int num_face, int i) const // face of neighbour i in column ori+i*dim, or -1 when there is no neighbour i
+  {
+    int ori=orientation_(num_face);
+    int elem=face_voisins_(num_face,i);
+    if(elem !=-1)
+      elem=elem_faces_(elem,ori+i*dim);
+    return elem;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION
+  int face_amont_conj(int num_face, int k, int i) const // tries neighbour 1 first, then neighbour 0; -1 when neither path reaches an element
+  {
+    const int ori = orientation_(num_face);
+    const int elem1 = face_voisins_(num_face, 1);
+    if (elem1 != -1)
+      {
+        const int elem_bis = face_voisins_(elem_faces_(elem1, k + i * dim), i);
+        if (elem_bis != -1)
+          return elem_faces_(elem_bis, ori);
+      }
+    const int elem0 = face_voisins_(num_face, 0);
+    assert(elem0 != -1 || elem1 != -1); // face must have at least one element neighbor
+    if (elem0 != -1)
+      {
+        const int elem_bis = face_voisins_(elem_faces_(elem0, k + i * dim), i);
+        if (elem_bis != -1)
+          return elem_faces_(elem_bis, ori + dim);
+      }
+    return -1;
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_elem(int n1, int n2, int k) const // signed difference of xp_ between n2 and n1 along direction k
+  {
+    return xp_(n2,k)-xp_(n1,k);
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_elem_period(int n1, int n2, int k) const // sum of two half distances: n2 to its face in column k, and n1 to its face in column k+dim
+  {
+    return xp_(n2,k) - xv_(elem_faces_(n2,k),k)
+           + xv_(elem_faces_(n1,k+dim),k) - xp_(n1,k);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION
+  double dim_elem(int n1, int k) const // extent of element n1 along k, from the xv_ of its faces in columns k and k+dim
+  {
+    return xv_(elem_faces_(n1,k+dim),k)-xv_(elem_faces_(n1,k),k);
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dim_face(int n1, int k) const
+  {
+    int elem = Kokkos::max(face_voisins_(n1,0), face_voisins_(n1,1)); // larger of the two neighbour indices, hence the valid one when the other is -1
+    return dim_elem(elem, k);
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_face_elem0(int num_face,int n0) const // xv_ of the face minus xp_ of n0, along the face orientation
+  {
+    int ori = orientation_[num_face];
+    return xv_(num_face,ori) - xp_(n0,ori);
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_face_elem1(int num_face,int n1) const // xp_ of n1 minus xv_ of the face, along the face orientation
+  {
+    int ori = orientation_[num_face];
+    return xp_(n1,ori) - xv_(num_face,ori);
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_face_elem0_axi(int num_face,int n0) const
+  {
+    int ori = orientation_[num_face];
+    double dist;
+    if (ori!=1)
+      dist = xv_(num_face,ori) - xp_(n0,ori);
+    else
+      {
+        double d_teta = xv_(num_face,1) - xp_(n0,1); // presumably direction 1 is the angle and xp_(n0,0) the radius - TODO confirm
+        if (d_teta < 0)
+          d_teta += 2.0*M_PI; // negative differences are shifted by one full turn
+        dist = d_teta*xp_(n0,0);
+      }
+    return dist;
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_face_elem1_axi(int num_face,int n1) const
+  {
+    int ori = orientation_[num_face];
+    double dist;
+    if (ori!=1)
+      dist = xp_(n1,ori) - xv_(num_face,ori);
+    else
+      {
+        double d_teta = xp_(n1,1) - xv_(num_face,1);
+        if (d_teta < 0)
+          d_teta += 2.0*M_PI; // negative differences are shifted by one full turn
+        dist = d_teta*xp_(n1,0);
+      }
+    return dist;
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_face(int fac1, int fac2, int k) const // signed difference of xv_ between fac2 and fac1 along direction k
+  {
+    return xv_(fac2,k) - xv_(fac1,k);
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_face_period(int fac1 , int fac2, int k) const // |vertex 1 of fac1 - xv_(fac1)| + |xv_(fac2) - vertex 0 of fac2| along k
+  {
+    double dist= Kokkos::fabs(coord_sommets_(face_sommets_(fac1,1),k)-xv_(fac1,k));
+    dist += Kokkos::fabs(xv_(fac2,k) - coord_sommets_(face_sommets_(fac2,0),k));
+    return dist;
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_norm_bord(int num_face) const // distance along the face orientation to neighbour 0 if it exists, else to neighbour 1
+  {
+    int n1 = face_voisins_(num_face,0);
+    int n2 = face_voisins_(num_face,1);
+    int k = orientation_[num_face];
+    if (n1!=-1)
+      return (xv_(num_face,k) - xp_(n1,k));
+    else
+      return (xp_(n2,k) - xv_(num_face,k));
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_norm_bord_axi(int num_face) const // like dist_norm_bord, with the d_teta treatment when the orientation is 1
+  {
+    int n1 = face_voisins_(num_face,0);
+    int n2 = face_voisins_(num_face,1);
+    int k = orientation_[num_face];
+    double dist;
+    if (n1!=-1)
+      if (k != 1)
+        dist = xv_(num_face,k) - xp_(n1,k);
+      else
+        {
+          double d_teta = xv_(num_face,1) - xp_(n1,1);
+          if (d_teta < 0)
+            d_teta += 2.0*M_PI;
+          dist = d_teta*xp_(n1,0);
+        }
+    else if (k != 1) // this else pairs with the outer if (n1!=-1): neighbour 0 is missing, use n2
+      dist = xp_(n2,k) - xv_(num_face,k);
+    else
+      {
+        double d_teta = xp_(n2,1) - xv_(num_face,1);
+        if (d_teta < 0)
+          d_teta += 2.0*M_PI;
+        dist = d_teta*xp_(n2,0);
+      }
+    return dist;
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_face_elem0_period(int num_face,int n0,double l) const
+  {
+    int ori = orientation_[num_face];
+    double dist = xv_(num_face,ori) - xp_(n0,ori);
+    if (dist > 0)
+      return dist;
+    else
+      return dist + l; // non-positive distances are shifted by l
+  }
+  KOKKOS_INLINE_FUNCTION
+  double dist_face_elem1_period(int num_face,int n1,double l) const
+  {
+    int ori = orientation_[num_face];
+    double dist = xp_(n1,ori) - xv_(num_face,ori);
+    if (dist > 0)
+      return dist;
+    else
+      return dist + l; // non-positive distances are shifted by l
+  }
+};
 
+#endif
 
 
 
diff --git a/src/VDF/Geometrie/Faces_VDF.cpp b/src/VDF/Geometrie/Faces_VDF.cpp
index 350aa0f3d3..17cd8ed6f8 100644
--- a/src/VDF/Geometrie/Faces_VDF.cpp
+++ b/src/VDF/Geometrie/Faces_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -46,6 +46,7 @@ void Faces_VDF::calculer_orientation(IntVect& tab_orientation,
   const Domaine& dom=mondomaine;
   double dx=0, dy=0, dz=1.e30;
 
+  ToDo_Kokkos("critical");
   for(int face=0; face<nb_faces_tot(); face++)
     {
       if(dimension == 2)
diff --git a/src/VDF/Geometrie/distances_VDF.cpp b/src/VDF/Geometrie/distances_VDF.cpp
index 8949dd94ec..55bdbae931 100644
--- a/src/VDF/Geometrie/distances_VDF.cpp
+++ b/src/VDF/Geometrie/distances_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -17,23 +17,9 @@
 
 void moy_2D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF& domaine, double& u)
 {
-  int num1, num2;
-  if (iori == 0)
-    {
-      num1 = domaine.elem_faces(elem, 1);
-      num2 = domaine.elem_faces(elem, 3);
-    }
-  else if (iori == 1)
-    {
-      num1 = domaine.elem_faces(elem, 0);
-      num2 = domaine.elem_faces(elem, 2);
-    }
-  else
-    {
-      Cerr << "valeur de iori " << iori << " impossible en 2D" << finl;
-      Process::exit();
-      num1 = num2 = -1;
-    }
+  assert(iori==0 || iori==1); // debug-only check: the removed code printed a message and called Process::exit() in every build
+  int num1 = domaine.elem_faces(elem, 1-iori); // column 1 when iori==0, column 0 when iori==1
+  int num2 = domaine.elem_faces(elem, 3-iori); // column 3 when iori==0, column 2 when iori==1
   u = 0.5 * (vit(num1) + vit(num2));
 }
 
@@ -42,38 +28,23 @@ double norm_2D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF&
   double v;
   moy_2D_vit(vit, elem, iori, domaine, v);
   v = std::fabs(v);
-  if (v == 0)
-    u = 0;
-  else
-    u = 1;
+  u = (v == 0) ? 0 : 1;
   return v;
 }
 
 double norm_2D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF& domaine, double u_paroi, double v_paroi, double& u)
 {
-  double vit_paroi;
-  if (iori == 0)
-    vit_paroi = v_paroi;
-  else if (iori == 1)
-    vit_paroi = u_paroi;
-  else
-    {
-      Cerr << "valeur de iori " << iori << " impossible en 2D" << finl;
-      Process::exit();
-      vit_paroi = 0;
-    }
+  assert(iori==0 || iori==1);
+  double vit_paroi = (iori==0) ? v_paroi : u_paroi;
   double v;
-  double n_v;
   moy_2D_vit(vit, elem, iori, domaine, v);
 
   //YB:30/01/04:
   //Les valeurs du cisaillement parietal sont maintenant signees.
   //En considerant les valeurs signees de la projection de la vitesse sur le plan parietal
   v = v - vit_paroi;
-  n_v = std::fabs(v);
-
+  double n_v = std::fabs(v);
   //Fin modif YB
-
   if (v == 0)
     u = 0;
   else if (v > 0)
@@ -85,34 +56,13 @@ double norm_2D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF&
 
 void moy_3D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF& domaine, double& val1, double& val2)
 {
-  int num1, num2, num3, num4;
-  if (iori == 0)
-    {
-      num1 = domaine.elem_faces(elem, 1);
-      num2 = domaine.elem_faces(elem, 4);
-      num3 = domaine.elem_faces(elem, 2);
-      num4 = domaine.elem_faces(elem, 5);
-    }
-  else if (iori == 1)
-    {
-      num1 = domaine.elem_faces(elem, 0);
-      num2 = domaine.elem_faces(elem, 3);
-      num3 = domaine.elem_faces(elem, 2);
-      num4 = domaine.elem_faces(elem, 5);
-    }
-  else if (iori == 2)
-    {
-      num1 = domaine.elem_faces(elem, 0);
-      num2 = domaine.elem_faces(elem, 3);
-      num3 = domaine.elem_faces(elem, 1);
-      num4 = domaine.elem_faces(elem, 4);
-    }
-  else
-    {
-      Cerr << "valeur de iori " << iori << " impossible en 3D" << finl;
-      Process::exit();
-      num1 = num2 = num3 = num4 = -1;
-    }
+  assert(iori==0 || iori==1 || iori==2); // debug-only check: the removed code printed a message and called Process::exit() in every build
+  int i = iori==0 ? 1 : 0; // first pair of columns: (1,4) when iori==0, otherwise (0,3)
+  int j = iori==2 ? 0 : 1; // second pair of columns: (1,4) when iori==2, otherwise (2,5)
+  int num1 = domaine.elem_faces(elem, 0+i);
+  int num2 = domaine.elem_faces(elem, 3+i);
+  int num3 = domaine.elem_faces(elem, 1+j);
+  int num4 = domaine.elem_faces(elem, 4+j);
   val1 = 0.5 * (vit(num1) + vit(num2));
   val2 = 0.5 * (vit(num3) + vit(num4));
 }
@@ -130,8 +80,8 @@ double norm_3D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF&
 
 double norm_3D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF& domaine, double u_paroi, double v_paroi, double w_paroi, double& val1, double& val2)
 {
-  double v1, v2, norm_vit;
   moy_3D_vit(vit, elem, iori, domaine, val1, val2);
+  double v1, v2;
   if (iori == 0)
     {
       v1 = val1 - v_paroi; // EB 28/08/25 : for a wall of normal x, val1 is the velocity in y direction and val2 in z direction
@@ -148,14 +98,13 @@ double norm_3D_vit(const DoubleVect& vit, int elem, int iori, const Domaine_VDF&
       v2 = val2 - v_paroi;
     }
   //Fin modif YB
-
   else
     {
       Cerr << "valeur de iori " << iori << " impossible en 3D" << finl;
       Process::exit();
       v1 = v2 = 0;
     }
-  norm_vit = sqrt(v1 * v1 + v2 * v2);
+  double norm_vit = sqrt(v1 * v1 + v2 * v2);
   val1 = v1 / (norm_vit + DMINFLOAT);
   val2 = v2 / (norm_vit + DMINFLOAT);
   return norm_vit;
@@ -170,14 +119,14 @@ double norm_vit(const DoubleVect& vit, int elem, int ori, const Domaine_VDF& dom
 }
 
 
-void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& rot)
+KOKKOS_INLINE_FUNCTION
+void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot)
 {
-  const IntTab& elem_faces = domaine_VDF.elem_faces();
   double delta_x_0, delta_x_1, delta_y_0, delta_y_1;
   double delta_x, delta_y;
   double deriv_vx, deriv_uy;
 
-  int N = val.line_size(), n;
+  int N = (int)val.extent(1), n;
 
   deriv_vx = 0;
   deriv_uy = 0;
@@ -189,7 +138,6 @@ void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, cons
 
   delta_x = (delta_x_1 - delta_x_0) * (delta_x_1 + delta_x_0) / (delta_x_1 * delta_x_0);
   delta_y = (delta_y_1 - delta_y_0) * (delta_y_1 + delta_y_0) / (delta_y_1 * delta_y_0);
-
   for (n=0; n<N; n++)
     {
       deriv_vx  = (delta_x_0 / delta_x_1 * val(elem_faces(elx1, 1), n) + delta_x * val(elem_faces(num_elem, 1), n) - delta_x_1 / delta_x_0 * val(elem_faces(elx0, 1), n));
@@ -204,18 +152,17 @@ void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, cons
     }
 }
 
-void calcul_bord2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& rot)
+KOKKOS_INLINE_FUNCTION
+void calcul_bord2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot)
 {
-  const IntTab& elem_faces = domaine_VDF.elem_faces();
   double delta_x_0, delta_x_1, delta_y_0, delta_y_1;
   double delta_x, delta_y;
   double deriv_vx, deriv_uy;
 
-  int N = val.line_size(), n;
+  int N = (int)val.extent(1), n;
 
   deriv_vx = 0;
   deriv_uy = 0;
-
   for (n=0; n<N; n++)
     {
       // Traitement des elements bord
@@ -290,37 +237,41 @@ void calcul_bord2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const D
     }
 }
 
-void calrotord2centelemdim2(DoubleTab& rot, const DoubleTab& val, const Domaine_VDF& domaine_VDF, int nb_elem, const IntTab& face_voisins, const IntTab& elem_faces)
+void calrotord2centelemdim2(DoubleTab& tab_rot, const DoubleTab& tab_val, const Domaine_VDF& domaine_VDF)
 {
-  if (rot.dimension(0) != nb_elem)
-    rot.resize(nb_elem);
-  int elx0, elx1, ely0, ely1;
-
-  for (int num_elem = 0; num_elem < nb_elem; num_elem++)
-    {
-      elx0 = face_voisins(elem_faces(num_elem, 0), 0);
-      elx1 = face_voisins(elem_faces(num_elem, 2), 1);
-      ely0 = face_voisins(elem_faces(num_elem, 1), 0);
-      ely1 = face_voisins(elem_faces(num_elem, 3), 1);
-
-      if ((elx0 != -1) && (elx1 != -1) && (ely0 != -1) && (ely1 != -1))
-        // Cas d'un element interne
-
-        calcul_interne2D(num_elem, elx0, elx1, ely0, ely1, domaine_VDF, val, rot);
-      else
-        calcul_bord2D(num_elem, elx0, elx1, ely0, ely1, domaine_VDF, val, rot);
-    }
+  int nb_elem = domaine_VDF.nb_elem(); // element count, connectivity and views are now all taken from domaine_VDF (they used to be parameters)
+  if (tab_rot.dimension(0) != nb_elem)
+    tab_rot.resize(nb_elem); // output is only resized when its first dimension does not match
+  Domaine_VDF_View dom_vdf_v(domaine_VDF); // view bundle handed to the per-element helpers inside the kernel
+  CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro();
+  CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+  CDoubleTabView val = tab_val.view_ro(); // input field, read-only
+  DoubleTabView rot = tab_rot.view_rw(); // output, written by calcul_interne2D / calcul_bord2D
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int num_elem)
+  {
+    const int elx0 = face_voisins(elem_faces(num_elem, 0), 0); // neighbour on side 0 of the face in column 0
+    const int elx1 = face_voisins(elem_faces(num_elem, 2), 1); // neighbour on side 1 of the face in column 2
+    const int ely0 = face_voisins(elem_faces(num_elem, 1), 0); // neighbour on side 0 of the face in column 1
+    const int ely1 = face_voisins(elem_faces(num_elem, 3), 1); // neighbour on side 1 of the face in column 3
+
+    if ((elx0 != -1) && (elx1 != -1) && (ely0 != -1) && (ely1 != -1))
+      // Interior element: all four neighbours exist
+      calcul_interne2D(num_elem, elx0, elx1, ely0, ely1, dom_vdf_v, elem_faces, val, rot);
+    else
+      calcul_bord2D(num_elem, elx0, elx1, ely0, ely1, dom_vdf_v, elem_faces, val, rot); // at least one neighbour is -1
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
 
 // Traitement des elements internes
-void calcul_interne3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& rot)
+KOKKOS_INLINE_FUNCTION
+void calcul_interne3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot)
 {
-  const IntTab& elem_faces = domaine_VDF.elem_faces();
   double delta_x_0, delta_x_1, delta_y_0, delta_y_1, delta_z_0, delta_z_1;
   double delta_x, delta_y, delta_z;
   double deriv_wy, deriv_vz, deriv_uz, deriv_wx, deriv_vx, deriv_uy;
 
-  int N = val.line_size();
+  int N = (int)val.extent(1);
 
   deriv_wy = 0;
   deriv_vz = 0;
@@ -339,7 +290,6 @@ void calcul_interne3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int
   delta_x = (delta_x_1 - delta_x_0) * (delta_x_1 + delta_x_0) / (delta_x_1 * delta_x_0);
   delta_y = (delta_y_1 - delta_y_0) * (delta_y_1 + delta_y_0) / (delta_y_1 * delta_y_0);
   delta_z = (delta_z_1 - delta_z_0) * (delta_z_1 + delta_z_0) / (delta_z_1 * delta_z_0);
-
   for (int n=0; n<N; n++)
     {
       deriv_vz  = (delta_z_0 / delta_z_1 * val(elem_faces(elz1, 1), n) + delta_z * val(elem_faces(num_elem, 1), n) - delta_z_1 / delta_z_0 * val(elem_faces(elz0, 1), n));
@@ -373,14 +323,14 @@ void calcul_interne3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int
 }
 
 // Traitement des elements bord
-void calcul_bord3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& rot)
+KOKKOS_INLINE_FUNCTION
+void calcul_bord3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot)
 {
-  const IntTab& elem_faces = domaine_VDF.elem_faces();
   double delta_x_0, delta_x_1, delta_y_0, delta_y_1, delta_z_0, delta_z_1;
   double delta_x, delta_y, delta_z;
   double deriv_wy, deriv_vz, deriv_uz, deriv_wx, deriv_vx, deriv_uy;
 
-  int N = val.line_size();
+  int N = (int)val.extent(1);
 
   deriv_wy = 0;
   deriv_vz = 0;
@@ -388,7 +338,6 @@ void calcul_bord3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz
   deriv_wx = 0;
   deriv_vx = 0;
   deriv_uy = 0;
-
   for (int n=0; n<N; n++)
     {
 
@@ -646,29 +595,32 @@ void calcul_bord3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz
     }
 }
 
-void calrotord2centelemdim3(DoubleTab& rot, const DoubleTab& val, const Domaine_VDF& domaine_VDF, int nb_elem, const IntTab& face_voisins, const IntTab& elem_faces)
+void calrotord2centelemdim3(DoubleTab& tab_rot, const DoubleTab& tab_val, const Domaine_VDF& domaine_VDF)
 {
-  if (rot.dimension(0) != nb_elem)
-    rot.resize(nb_elem, 3);
-  int elx0, elx1, ely0, ely1, elz0, elz1;
-
-  for (int num_elem = 0; num_elem < nb_elem; num_elem++)
-    {
-
-      elx0 = face_voisins(elem_faces(num_elem, 0), 0);
-      elx1 = face_voisins(elem_faces(num_elem, 3), 1);
-      ely0 = face_voisins(elem_faces(num_elem, 1), 0);
-      ely1 = face_voisins(elem_faces(num_elem, 4), 1);
-      elz0 = face_voisins(elem_faces(num_elem, 2), 0);
-      elz1 = face_voisins(elem_faces(num_elem, 5), 1);
-
-      if ((elx0 != -1) && (elx1 != -1) && (ely0 != -1) && (ely1 != -1) && (elz0 != -1) && (elz1 != -1))
-        // Cas d'un element interne
-
-        calcul_interne3D(num_elem, elx0, elx1, ely0, ely1, elz0, elz1, domaine_VDF, val, rot);
-      else
-        calcul_bord3D(num_elem, elx0, elx1, ely0, ely1, elz0, elz1, domaine_VDF, val, rot);
-    }
+  int nb_elem = domaine_VDF.nb_elem(); // element count, connectivity and views are now all taken from domaine_VDF (they used to be parameters)
+  if (tab_rot.dimension(0) != nb_elem)
+    tab_rot.resize(nb_elem, 3); // output is only resized (to 3 columns) when its first dimension does not match
+  Domaine_VDF_View dom_vdf_v(domaine_VDF); // view bundle handed to the per-element helpers inside the kernel
+  CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro();
+  CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+  CDoubleTabView val = tab_val.view_ro(); // input field, read-only
+  DoubleTabView rot = tab_rot.view_rw(); // output, written by calcul_interne3D / calcul_bord3D
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int num_elem)
+  {
+    const int elx0 = face_voisins(elem_faces(num_elem, 0), 0); // side-0 neighbours use the faces in columns 0..2
+    const int elx1 = face_voisins(elem_faces(num_elem, 3), 1); // side-1 neighbours use the faces in columns 3..5
+    const int ely0 = face_voisins(elem_faces(num_elem, 1), 0);
+    const int ely1 = face_voisins(elem_faces(num_elem, 4), 1);
+    const int elz0 = face_voisins(elem_faces(num_elem, 2), 0);
+    const int elz1 = face_voisins(elem_faces(num_elem, 5), 1);
+
+    if ((elx0 != -1) && (elx1 != -1) && (ely0 != -1) && (ely1 != -1) && (elz0 != -1) && (elz1 != -1))
+      // Interior element: all six neighbours exist
+      calcul_interne3D(num_elem, elx0, elx1, ely0, ely1, elz0, elz1, dom_vdf_v, elem_faces, val, rot);
+    else
+      calcul_bord3D(num_elem, elx0, elx1, ely0, ely1, elz0, elz1, dom_vdf_v, elem_faces, val, rot); // at least one neighbour is -1
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
 
 // Calcul du produit scalaire du tenseur des vitesses de deformation
@@ -1245,7 +1197,7 @@ void caldscaldcentelemdim2(DoubleTab& dscald, const DoubleTab& val, const Domain
   if (dscald.dimension(0) != nb_elem)
     dscald.resize(nb_elem);
   int elx0, elx1, ely0, ely1;
-
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < nb_elem; num_elem++)
     {
 
@@ -1269,7 +1221,7 @@ void caldscaldcentelemdim3(DoubleTab& dscald, const DoubleTab& val, const Domain
   if (dscald.dimension(0) != nb_elem)
     dscald.resize(nb_elem);
   int elx0, elx1, ely0, ely1, elz0, elz1;
-
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < nb_elem; num_elem++)
     {
 
diff --git a/src/VDF/Geometrie/distances_VDF.h b/src/VDF/Geometrie/distances_VDF.h
index f797286913..0de15a7cc1 100644
--- a/src/VDF/Geometrie/distances_VDF.h
+++ b/src/VDF/Geometrie/distances_VDF.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -26,12 +26,68 @@ double norm_3D_vit(const DoubleVect&, int, int, const Domaine_VDF&, double&, dou
 double norm_3D_vit(const DoubleVect&, int, int, const Domaine_VDF&, double, double, double, double&, double&);
 double norm_vit(const DoubleVect&, int, int, const Domaine_VDF&, const ArrOfDouble& vit_paroi, ArrOfDouble& val);
 
-void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& rot);
-void calcul_bord2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& rot);
-void calrotord2centelemdim2(DoubleTab& rot, const DoubleTab& val, const Domaine_VDF& domaine_VDF, int nb_elem, const IntTab& face_voisins, const IntTab& elem_faces);
-void calcul_interne3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& rot);
-void calcul_bord3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& rot);
-void calrotord2centelemdim3(DoubleTab& rot, const DoubleTab& val, const Domaine_VDF& domaine_VDF, int nb_elem, const IntTab& face_voisins, const IntTab& elem_faces);
+KOKKOS_INLINE_FUNCTION void moy_2D_vit(CDoubleArrView vit, int elem, int iori, CIntTabView elem_faces, double& u)
+{ // Writes into u the mean of vit over two faces of elem: elem_faces columns (1,3) when iori==0, (0,2) when iori==1
+  assert(iori==0 || iori==1); // only orientations 0 and 1 are handled
+  u = 0.5 * (vit(elem_faces(elem, 1-iori)) + vit(elem_faces(elem, 3-iori)));
+}
+
+KOKKOS_INLINE_FUNCTION double norm_2D_vit(CDoubleArrView vit, int elem, int iori, CIntTabView elem_faces, double& u)
+{ // Returns the absolute value of the mean computed by moy_2D_vit; u is an output flag
+  double v;
+  moy_2D_vit(vit, elem, iori, elem_faces, v);
+  v = Kokkos::fabs(v);
+  u = (v == 0) ? 0 : 1; // 0 only when the mean is exactly zero, 1 otherwise
+  return v;
+}
+
+KOKKOS_INLINE_FUNCTION double norm_2D_vit(CDoubleArrView vit, int elem, int iori, CIntTabView elem_faces, double u_paroi, double v_paroi, double& u)
+{ // Mean from moy_2D_vit minus one wall velocity component; returns its absolute value and stores its sign in u
+  assert(iori==0 || iori==1);
+  double v;
+  moy_2D_vit(vit, elem, iori, elem_faces, v);
+  v -= (iori==0) ? v_paroi : u_paroi; // v_paroi is subtracted when iori==0, u_paroi when iori==1
+  double n_v = Kokkos::fabs(v);
+  u = (v == 0) ? 0 : (v > 0 ? 1 : -1); // sign of the relative value: -1, 0 or 1
+  return n_v;
+}
+
+KOKKOS_INLINE_FUNCTION void moy_3D_vit(CDoubleArrView vit, int elem, int iori, CIntTabView elem_faces, double& val1, double& val2)
+{ // val1 and val2 receive the means of vit over two pairs of faces of elem; which pairs depends on iori
+  assert(iori==0 || iori==1 || iori==2);
+  int i = iori==0 ? 1 : 0; // first pair of columns: (1,4) when iori==0, otherwise (0,3)
+  int j = iori==2 ? 0 : 1; // second pair of columns: (1,4) when iori==2, otherwise (2,5)
+  val1 = 0.5 * (vit(elem_faces(elem, i))   + vit(elem_faces(elem, 3+i)));
+  val2 = 0.5 * (vit(elem_faces(elem, 1+j)) + vit(elem_faces(elem, 4+j)));
+}
+
+KOKKOS_INLINE_FUNCTION double norm_3D_vit(CDoubleArrView vit, int elem, int iori, CIntTabView elem_faces, double& val1, double& val2)
+{ // Returns the Euclidean norm of the two means from moy_3D_vit; val1/val2 are overwritten with |mean| / norm
+  moy_3D_vit(vit, elem, iori, elem_faces, val1, val2);
+  double v1 = Kokkos::fabs(val1), v2 = Kokkos::fabs(val2); // absolute values: the normalised outputs are therefore non-negative
+  double norm_vit = Kokkos::sqrt(v1 * v1 + v2 * v2);
+  val1 = v1 / (norm_vit + DMINFLOAT); // presumably DMINFLOAT is a tiny positive constant guarding against a zero norm - TODO confirm
+  val2 = v2 / (norm_vit + DMINFLOAT);
+  return norm_vit;
+}
+
+KOKKOS_INLINE_FUNCTION double norm_3D_vit(CDoubleArrView vit, int elem, int iori, CIntTabView elem_faces, double u_paroi, double v_paroi, double w_paroi, double& val1, double& val2)
+{ // Means from moy_3D_vit minus wall velocity components; returns their norm, val1/val2 become the signed components divided by it
+  moy_3D_vit(vit, elem, iori, elem_faces, val1, val2);
+  double v1 = val1 - (iori==0 ? v_paroi : u_paroi); // first component: v_paroi when iori==0, otherwise u_paroi
+  double v2 = val2 - (iori==2 ? v_paroi : w_paroi); // second component: v_paroi when iori==2, otherwise w_paroi
+  double norm_vit = Kokkos::sqrt(v1 * v1 + v2 * v2);
+  val1 = v1 / (norm_vit + DMINFLOAT); // presumably DMINFLOAT is a tiny positive constant guarding against a zero norm - TODO confirm
+  val2 = v2 / (norm_vit + DMINFLOAT);
+  return norm_vit;
+}
+
+KOKKOS_INLINE_FUNCTION void calcul_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot); // NOTE(review): the four calcul_* helpers are only declared here, their bodies are in distances_VDF.cpp; an inline function has to be defined in every translation unit that calls it - confirm no other file relies on these declarations
+KOKKOS_INLINE_FUNCTION void calcul_bord2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot);
+void calrotord2centelemdim2(DoubleTab& rot, const DoubleTab& val, const Domaine_VDF& domaine_VDF); // host entry point (2D): launches the per-element kernel
+KOKKOS_INLINE_FUNCTION void calcul_interne3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot);
+KOKKOS_INLINE_FUNCTION void calcul_bord3D(int num_elem, int elx0, int elx1, int ely0, int ely1, int elz0, int elz1, const Domaine_VDF_View& domaine_VDF, CIntTabView elem_faces, CDoubleTabView val, DoubleTabView rot);
+void calrotord2centelemdim3(DoubleTab& rot, const DoubleTab& val, const Domaine_VDF& domaine_VDF); // host entry point (3D): launches the per-element kernel
 
 // Calcul du produit scalaire du tenseur des vitesses de deformation en coordonnees cartesiennes : calcul 2D puis 3D.
 void calcul_dscald_interne2D(int num_elem, int elx0, int elx1, int ely0, int ely1, const Domaine_VDF& domaine_VDF, const DoubleTab& val, DoubleTab& dscald);
diff --git a/src/VDF/Milieu/EDO_Pression_th_VDF.cpp b/src/VDF/Milieu/EDO_Pression_th_VDF.cpp
index de532824e3..2fa2164d2c 100644
--- a/src/VDF/Milieu/EDO_Pression_th_VDF.cpp
+++ b/src/VDF/Milieu/EDO_Pression_th_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -17,6 +17,8 @@
 #include <Loi_Etat_Multi_GP_QC.h>
 #include <EDO_Pression_th_VDF.h>
 #include <Domaine_VDF.h>
+#include <kokkos++.h>
+#include <TRUSTArray_kokkos.tpp>
 
 Implemente_base(EDO_Pression_th_VDF, "EDO_Pression_th_VDF", EDO_Pression_th_base);
 
@@ -90,27 +92,27 @@ double EDO_Pression_th_VDF::masse_totale(const DoubleTab& P, const DoubleTab& T)
   return M;
 }
 
-void EDO_Pression_th_VDF::calculer_grad(const DoubleTab& inco, DoubleTab& resu)
+void EDO_Pression_th_VDF::calculer_grad(const DoubleTab& tab_inco, DoubleTab& tab_resu)
 {
-  int face, n0, n1, ori;
-  double coef;
   const Domaine_VDF& dom = ref_cast(Domaine_VDF, le_dom.valeur());
 
-  const IntTab& face_voisins = dom.face_voisins();
-  const IntVect& orientation = dom.orientation();
-  const DoubleVect& porosite_surf = le_fluide_->porosite_face();
-  const DoubleTab& xp = dom.xp();
-  const DoubleVect& volume_entrelaces = le_dom->volumes_entrelaces();
+  CIntTabView face_voisins = dom.face_voisins().view_ro();
+  CIntArrView orientation = dom.orientation().view_ro();
+  CDoubleArrView porosite_surf = le_fluide_->porosite_face().view_ro();
+  CDoubleTabView xp = dom.xp().view_ro();
+  CDoubleArrView volume_entrelaces = le_dom->volumes_entrelaces().view_ro();
+  CDoubleArrView inco = static_cast<const ArrOfDouble&>(tab_inco).view_ro(); // tab_inco accessed through its ArrOfDouble base as a 1D read-only view
+  DoubleArrView resu = static_cast<ArrOfDouble&>(tab_resu).view_rw(); // tab_resu as a 1D read-write view: the kernel accumulates into it
 
   // Boucle sur les faces internes
-  ToDo_Kokkos("critical");
-  for (face = dom.premiere_face_int(); face < dom.nb_faces(); face++)
-    {
-      n0 = face_voisins(face, 0);
-      n1 = face_voisins(face, 1);
-      ori = orientation(face);
-      coef = volume_entrelaces(face) * porosite_surf(face);
-      coef = 1;
-      resu(face) += coef * (inco(n1) - inco(n0)) / (xp(n1, ori) - xp(n0, ori));
-    }
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(dom.premiere_face_int(), dom.nb_faces()), KOKKOS_LAMBDA(const int face)
+  {
+    const int n0 = face_voisins(face, 0);
+    const int n1 = face_voisins(face, 1);
+    const int ori = orientation(face);
+    double coef = volume_entrelaces(face) * porosite_surf(face); // NOTE(review): this product is overwritten on the next line and never used
+    coef = 1; // coefficient forced to 1, exactly as in the host loop this kernel replaces
+    resu(face) += coef * (inco(n1) - inco(n0)) / (xp(n1, ori) - xp(n0, ori)); // difference of inco between the two neighbours divided by their xp difference along ori
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
diff --git a/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Parfait.cpp b/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Parfait.cpp
index 13bd387699..4eff72d8ce 100644
--- a/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Parfait.cpp
+++ b/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Parfait.cpp
@@ -65,6 +65,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n)
       const DoubleTab& tempn = le_fluide_->inco_chaleur().passe();
       double cn1 = 0, cn = 0, v;
       int elem, nb_elem = le_dom->nb_elem();
+      ToDo_Kokkos("critical");
       for (elem = 0; elem < nb_elem; elem++)
         {
           v = le_dom->volumes(elem);
@@ -87,6 +88,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n)
               const Front_VF& la_front_dis = ref_cast(Front_VF, la_cl.frontiere_dis());
               int ndeb = la_front_dis.num_premiere_face();
               int nfin = ndeb + la_front_dis.nb_faces();
+              ToDo_Kokkos("critical");
               for (int num_face = ndeb; num_face < nfin; num_face++)
                 {
                   int n0 = face_voisins(num_face, 0);
@@ -148,6 +150,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n)
   ref_cast(Navier_Stokes_std,le_fluide_->vitesse().equation()).operateur_divergence().calculer(tab_vit, divU);
   DoubleTrav gradT(tab_vit.dimension(0));
   DoubleTrav Tstar(tab_vit.dimension(0));
+  ToDo_Kokkos("critical");
   for (elem = 0; elem < nb_elem; elem++)
     {
       Tstar(elem) = .5 * (tempn(elem) + tempnp1(elem));
@@ -155,6 +158,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n)
   calculer_grad(Tstar, gradT);
   DoubleTab u_gradT(nb_elem);
   int f1, f2, i;
+  ToDo_Kokkos("critical");
   for (elem = 0; elem < nb_elem; elem++)
     {
       u_gradT(elem) = 0;
@@ -167,6 +171,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n)
         }
     }
 
+  ToDo_Kokkos("critical");
   for (elem = 0; elem < nb_elem; elem++)
     {
       v = le_dom->volumes(elem);
@@ -186,6 +191,7 @@ double EDO_Pression_th_VDF_Gaz_Parfait::resoudre(double Pth_n)
       ndeb = frontiere_dis.num_premiere_face();
       nfin = ndeb + frontiere_dis.nb_faces();
       //if (sub_type(Neumann_sortie_libre, la_cl.valeur()) || sub_type(Dirichlet_entree_fluide, la_cl.valeur())) {
+      ToDo_Kokkos("critical");
       for (face = ndeb; face < nfin; face++)
         {
           elem = le_dom->face_voisins(face, 0);
@@ -236,6 +242,7 @@ void EDO_Pression_th_VDF_Gaz_Parfait::resoudre(DoubleTab& Pth_n)
 
       double cn1 = 0., cn = 0., v = -123.;
 
+      ToDo_Kokkos("critical");
       for (int elem = 0; elem < le_dom->nb_elem(); elem++)
         {
           v = le_dom->volumes(elem);
@@ -257,6 +264,7 @@ void EDO_Pression_th_VDF_Gaz_Parfait::resoudre(DoubleTab& Pth_n)
               const Front_VF& la_front_dis = ref_cast(Front_VF, la_cl.frontiere_dis());
               int ndeb = la_front_dis.num_premiere_face();
               int nfin = ndeb + la_front_dis.nb_faces();
+              ToDo_Kokkos("critical");
               for (int num_face = ndeb; num_face < nfin; num_face++)
                 {
                   int n0 = face_voisins(num_face, 0);
@@ -281,6 +289,7 @@ void EDO_Pression_th_VDF_Gaz_Parfait::resoudre(DoubleTab& Pth_n)
       double cnt = cn, cn1t = cn1, cmt = cm;
       mp_sum_for_each(cnt, cn1t, cmt);
 
+      ToDo_Kokkos("critical");
       for (int elem = 0; elem < le_dom->nb_elem(); elem++)
         Pth_n(elem) = Pth_n(elem) * cnt / cn1t / (1. + dt / cn1t * cmt);
     }
diff --git a/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Reel.cpp b/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Reel.cpp
index 433dda371f..d3cf4c7703 100644
--- a/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Reel.cpp
+++ b/src/VDF/Milieu/EDO_Pression_th_VDF_Gaz_Reel.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -66,6 +66,7 @@ double EDO_Pression_th_VDF_Gaz_Reel::resoudre(double Pth_n)
   ref_cast(Navier_Stokes_std,le_fluide_->vitesse().equation()).operateur_divergence().calculer(tab_vit, divU);
   DoubleTrav gradh(tab_vit.dimension(0));
   DoubleTrav Hstar(tab_vit.dimension(0));
+  ToDo_Kokkos("critical");
   for (elem = 0; elem < nb_elem; elem++)
     {
       Hstar(elem) = .5 * (tab_hn(elem) + tab_hnp1(elem));
@@ -73,6 +74,7 @@ double EDO_Pression_th_VDF_Gaz_Reel::resoudre(double Pth_n)
   calculer_grad(Hstar, gradh);
   DoubleTab u_gradh(nb_elem);
   int f1, f2;
+  ToDo_Kokkos("critical");
   for (elem = 0; elem < nb_elem; elem++)
     {
       u_gradh(elem) = 0;
@@ -84,6 +86,7 @@ double EDO_Pression_th_VDF_Gaz_Reel::resoudre(double Pth_n)
         }
     }
 
+  ToDo_Kokkos("critical");
   for (elem = 0; elem < nb_elem; elem++)
     {
       v = dom.volumes(elem);
@@ -111,6 +114,7 @@ double EDO_Pression_th_VDF_Gaz_Reel::resoudre(double Pth_n)
     {
       tmp = Pth;
       Fnp1 = 0;
+      ToDo_Kokkos("critical");
       for (elem = 0; elem < nb_elem; elem++)
         {
           v = dom.volumes(elem);
diff --git a/src/VDF/Milieu/EOS_Tools_VDF.cpp b/src/VDF/Milieu/EOS_Tools_VDF.cpp
index fefee85d96..2677ef1c15 100644
--- a/src/VDF/Milieu/EOS_Tools_VDF.cpp
+++ b/src/VDF/Milieu/EOS_Tools_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -37,7 +37,7 @@ Entree& EOS_Tools_VDF::readOn(Entree& is)
   return is;
 }
 
-void  EOS_Tools_VDF::associer_domaines(const Domaine_dis_base& dds, const Domaine_Cl_dis_base& domaine_cl)
+void EOS_Tools_VDF::associer_domaines(const Domaine_dis_base& dds, const Domaine_Cl_dis_base& domaine_cl)
 {
   le_dom = ref_cast(Domaine_VDF,dds);
   le_dom_Cl = domaine_cl;
@@ -57,46 +57,58 @@ void  EOS_Tools_VDF::associer_domaines(const Domaine_dis_base& dds, const Domain
 double EOS_Tools_VDF::moyenne_vol(const DoubleTab& tab) const
 {
   int nb_elem=le_dom->nb_elem();
-  const DoubleVect& volumes = le_dom->volumes();
   assert(tab.dimension(0)==nb_elem);
-  ArrOfDouble sum(2);
-  sum = 0;
-  for (int elem=0 ; elem<nb_elem ; elem++)
-    {
-      double v = volumes(elem);
-      sum[0] += v;
-      sum[1] += v*tab(elem);
-    }
-  mp_sum_for_each_item(sum);
-  return sum[1]/sum[0];
+  double volume = 0, sum = 0;
+  CDoubleArrView volumes = le_dom->volumes().view_ro();
+  CDoubleArrView val = static_cast<const ArrOfDouble&>(tab).view_ro();
+  Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem, double & sum_tmp, double & volume_tmp)
+  {
+    double v = volumes(elem);
+    volume_tmp += v;
+    sum_tmp += v * val(elem);
+  }, sum, volume);
+  end_gpu_timer(__KERNEL_NAME__);
+
+  DoubleTrav pair(2);
+  pair[0] = volume;
+  pair[1] = sum;
+  mp_sum_for_each_item(pair);
+  return pair[1]/pair[0];
 }
 
 void EOS_Tools_VDF::calculer_rho_face_np1(const DoubleTab& tab_rhoP0)
 {
-  int face, elem, nb_faces_tot = le_dom->nb_faces_tot();
+  int nb_faces_tot = le_dom->nb_faces_tot();
   Debog::verifier("tab_rhoP0",tab_rhoP0);
-  int i, nb_comp;
-  IntTab& face_voisins = le_dom->face_voisins();
-  for (face=0 ; face<nb_faces_tot ; face++)
-    {
-      nb_comp=0;
-      tab_rho_face_np1(face) = 0;
-      for (i=0 ; i<2 ; i++)
-        {
-          elem= face_voisins(face,i);
-          if (elem!=-1)
-            {
-              nb_comp++;
-              tab_rho_face_np1(face) += tab_rhoP0(elem);
-            }
-        }
-      tab_rho_face_np1(face) /= nb_comp;
-    }
+  CIntTabView face_voisins = le_dom->face_voisins().view_ro();
+  CDoubleArrView rhoP0 = static_cast<const ArrOfDouble&>(tab_rhoP0).view_ro();
+  CDoubleArrView rho_face = static_cast<const ArrOfDouble&>(tab_rho_face).view_ro();
+  DoubleArrView rho_face_np1 = static_cast<ArrOfDouble&>(tab_rho_face_np1).view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_tot, KOKKOS_LAMBDA(const int face)
+  {
+    int nb_comp=0;
+    rho_face_np1(face) = 0;
+    for (int i=0 ; i<2 ; i++)
+      {
+        const int elem = face_voisins(face,i);
+        if (elem!=-1)
+          {
+            nb_comp++;
+            rho_face_np1(face) += rhoP0(elem);
+          }
+      }
+    rho_face_np1(face) /= nb_comp;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 
   tab_rho_face_np1.echange_espace_virtuel();
   Debog::verifier("tab_rho_face_np1",tab_rho_face_np1);
-  for (face=0 ; face<nb_faces_tot ; face++)
-    tab_rho_face_demi(face)=(tab_rho_face_np1(face)+tab_rho_face(face))/2.;
+  DoubleArrView rho_face_demi = static_cast<ArrOfDouble&>(tab_rho_face_demi).view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_tot, KOKKOS_LAMBDA(const int face)
+  {
+    rho_face_demi(face)=0.5*(rho_face_np1(face)+rho_face(face));
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
 
 /*! @brief Renvoie rho avec la meme discretisation que la vitesse : une valeur par face en VDF
@@ -114,6 +126,7 @@ const DoubleTab& EOS_Tools_VDF::rho_discvit() const
  */
 void EOS_Tools_VDF::divu_discvit(const DoubleTab& secmem1, DoubleTab& secmem2)
 {
+  ToDo_Kokkos("VDF critical but not tested...");
   assert_espace_virtuel_vect(secmem1);
   int nb_faces_tot = le_dom->nb_faces_tot();
   IntTab& face_voisins = le_dom->face_voisins();
@@ -143,17 +156,21 @@ void EOS_Tools_VDF::divu_discvit(const DoubleTab& secmem1, DoubleTab& secmem2)
 void EOS_Tools_VDF::secmembre_divU_Z(DoubleTab& tab_W) const
 {
   double dt = le_fluide().vitesse().equation().schema_temps().pas_de_temps();
-  int elem,nb_elem = le_dom->nb_elem();//,nb_faces = le_dom->nb_faces();
-  DoubleVect tab_dZ(nb_elem);
-  //DoubleTab tab_gradZ(nb_faces);
+  int nb_elem = le_dom->nb_elem();
+  DoubleTrav tab_dZ(nb_elem);
   const DoubleTab& tab_rhonP0 = le_fluide().loi_etat()->rho_n();
   const DoubleTab& tab_rhonp1P0 = le_fluide().loi_etat()->rho_np1();
   Debog::verifier("divU tab_rhonP0",tab_rhonP0);
   Debog::verifier("divU tab_rhonp1P0",tab_rhonp1P0);
-  const DoubleVect& volumes = le_dom->volumes();
 
-  for (elem=0 ; elem<nb_elem ; elem++)
-    tab_dZ(elem) = (tab_rhonp1P0(elem)-tab_rhonP0(elem))/dt;
+  CDoubleArrView rhonP0 = static_cast<const ArrOfDouble&>(tab_rhonP0).view_ro();
+  CDoubleArrView rhonp1P0 = static_cast<const ArrOfDouble&>(tab_rhonp1P0).view_ro();
+  DoubleArrView dZ = static_cast<ArrOfDouble&>(tab_dZ).view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem)
+  {
+    dZ(elem) = (rhonp1P0(elem)-rhonP0(elem))/dt;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 
   // Ajout des termes sources speciaux de l'equation de masse:
   const bool has_mass_flux = (sub_type(Navier_Stokes_Fluide_Dilatable_base, le_fluide().vitesse().equation())) ?
@@ -165,16 +182,26 @@ void EOS_Tools_VDF::secmembre_divU_Z(DoubleTab& tab_W) const
       src_mass.ajouter_projection(le_fluide(), static_cast<DoubleTab&>(tab_dZ));
     }
 
-  for (elem = 0; elem < nb_elem; elem++)
-    tab_W(elem) = -tab_dZ(elem) * volumes(elem);
+  CDoubleArrView volumes = le_dom->volumes().view_ro();
+  CDoubleArrView dZ2 = static_cast<const ArrOfDouble&>(tab_dZ).view_ro(); // Re-acquire a read-only view of tab_dZ: src_mass.ajouter_projection() above may have updated it on the host
+  DoubleArrView W = static_cast<ArrOfDouble&>(tab_W).view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem)
+  {
+    W(elem) = -dZ2(elem) * volumes(elem);
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
 
 void EOS_Tools_VDF::mettre_a_jour(double temps)
 {
   int n=tab_rho_face_np1.size_totale();
-  for (int i=0; i<n; i++)
-    {
-      tab_rho_face(i)=tab_rho_face_np1(i);
-      tab_rho_face_demi(i)=tab_rho_face_np1(i);
-    }
+  CDoubleArrView rho_face_np1 = static_cast<const ArrOfDouble&>(tab_rho_face_np1).view_ro();
+  DoubleArrView rho_face = static_cast<ArrOfDouble&>(tab_rho_face).view_wo();
+  DoubleArrView rho_face_demi = static_cast<ArrOfDouble&>(tab_rho_face_demi).view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), n, KOKKOS_LAMBDA(const int i)
+  {
+    rho_face(i)      = rho_face_np1(i);
+    rho_face_demi(i) = rho_face_np1(i);
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF.h
index d85514240b..018b44d3ad 100644
--- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF.h
+++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -33,7 +33,11 @@ class Eval_Conv_VDF : public Evaluateur_VDF, public Eval_Conv_VDF_tools
 
 public:
   inline Eval_Conv_VDF() { }
-  inline Eval_Conv_VDF(const Eval_Conv_VDF& eval) : Evaluateur_VDF(eval), vitesse_(eval.vitesse_) { dt_vitesse.ref(eval.dt_vitesse); }
+  inline Eval_Conv_VDF(const Eval_Conv_VDF& eval) : Evaluateur_VDF(eval), vitesse_(eval.vitesse_)
+  {
+    dt_vitesse.ref(eval.dt_vitesse);
+    dt_vitesse_v_ = eval.dt_vitesse_v_;
+  }
 
   inline void associer(const Champ_Face_VDF& );
   inline void mettre_a_jour( ) { dt_vitesse.ref(vitesse_->valeurs()); }
@@ -41,19 +45,52 @@ class Eval_Conv_VDF : public Evaluateur_VDF, public Eval_Conv_VDF_tools
   inline Champ_Inc_base& vitesse() { return vitesse_.valeur(); }
 
   // pour CRTP
-  inline int get_elem(int i, int j) const { return elem_(i,j); }
-  inline int get_orientation(int i ) const { return orientation(i); }
-  inline int get_premiere_face_bord() const { return premiere_face_bord; }
-  inline double get_dt_vitesse(int face, int comp = 0) const { return dt_vitesse(face, comp); }
-  inline double get_surface_porosite(int face) const { return surface(face)*porosite(face); }
-  inline double get_surface(int face) const { return surface(face); }
-  inline double get_porosite(int face) const { return porosite(face); }
   inline const DoubleTab& get_tab_vitesse() const { return dt_vitesse; }
-  inline const Domaine_Cl_VDF& get_la_zcl() const { return la_zcl.valeur(); }
+
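+  // Accessors for host arrays or device views: ExecSpace == HostSpace reads the host array, any other ExecSpace reads the matching *_v_ view (or le_dom_v_)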
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int get_elem(int i, int j) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return elem_(i,j); else return elem_v_(i,j); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int get_orientation(int i ) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return orientation(i); else return orientation_v_(i); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_surface(int face) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return surface(face); else return surface_v_(face); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int get_dt_vitesse_nb_comp() const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return dt_vitesse.line_size(); else return (int)dt_vitesse_v_.extent(1); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_dt_vitesse(int face, int comp = 0) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return dt_vitesse(face, comp); else return dt_vitesse_v_(face, comp); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_surface_porosite(int face) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return surface(face)*porosite(face); else return surface_v_(face)*porosite_v_(face); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_porosite(int face) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return porosite(face); else return porosite_v_(face); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_dist_face(int n1,int n2,int k) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return le_dom->dist_face(n1,n2,k); else return le_dom_v_.dist_face(n1,n2,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_dist_face_period(int n1,int n2,int k) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return le_dom->dist_face_period(n1,n2,k); else return le_dom_v_.dist_face_period(n1,n2,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int get_amont_amont(int face, int i) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return le_dom->amont_amont(face,i); else return le_dom_v_.amont_amont(face,i); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int get_face_amont_princ(int num_face, int i) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return le_dom->face_amont_princ(num_face,i); else return le_dom_v_.face_amont_princ(num_face,i); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_FORCEINLINE_FUNCTION int get_face_amont_conj(int num_face,int i, int k) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return le_dom->face_amont_conj(num_face,i,k); else return le_dom_v_.face_amont_conj(num_face,i,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_dim_face(int n1,int k) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return le_dom->dim_face(n1,k); else return le_dom_v_.dim_face(n1,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_dim_elem(int n1, int k) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return le_dom->dim_elem(n1, k); else return le_dom_v_.dim_elem(n1, k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_dist_elem(int n1, int n2, int k) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return le_dom->dist_elem(n1, n2, k); else return le_dom_v_.dist_elem(n1, n2, k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double get_dist_elem_period(int n1, int n2, int k) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return le_dom->dist_elem_period(n1, n2, k); else return le_dom_v_.dist_elem_period(n1, n2, k); }
+
+  void view_ro() const override
+  {
+    Evaluateur_VDF::view_ro();
+    dt_vitesse_v_ = dt_vitesse.view_ro();
+  }
 
 protected:
   OBS_PTR(Champ_Face_VDF) vitesse_;
   DoubleTab dt_vitesse;
+  mutable CDoubleTabView dt_vitesse_v_;
 };
 
 /*! @brief associe le champ de vitesse transportante
diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.h
index 493e3db25f..8a5014619e 100644
--- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.h
+++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.h
@@ -38,9 +38,11 @@ class Eval_Conv_VDF_Elem : public Eval_VDF_Elem
   // To overload
   template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Dirichlet_entree_fluide&, const int, Type_Double& ) const;
   template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Neumann_sortie_libre&, const int, Type_Double& ) const;
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Periodique&, const int, Type_Double& ) const;
   template <typename Type_Double> inline void flux_face(const DoubleTab&, const int, const int, const int, const Echange_externe_impose&, const int, Type_Double& ) const { /* Do nothing */ }
-  template <typename Type_Double> inline void flux_faces_interne(const DoubleTab&, const int, Type_Double& ) const;
+
+  template <typename BC>
+  KOKKOS_INLINE_FUNCTION void flux_faces_bord_comp(CDoubleTabView, CDoubleTabView, const int, const BC_View&, const int, const int, double&) const;
+  KOKKOS_INLINE_FUNCTION void flux_faces_interne_comp(CDoubleTabView, const int, const int, double&) const;
 
   /* ************************************** *
    * *********  POUR L'IMPLICITE ********** *
@@ -77,17 +79,36 @@ class Eval_Conv_VDF_Elem : public Eval_VDF_Elem
   template <typename Type_Double> inline void coeffs_face_bloc_vitesse_common(const DoubleTab&, const int, Type_Double& ) const;
 
   // CRTP pattern to static_cast the appropriate class and get the implementation : This is magic !
-  inline int elem_(const int i, const int j) const { return static_cast<const DERIVED_T *>(this)->get_elem(i,j); }
-  inline int amont_amont_(const int face, const int i) const { return static_cast<const DERIVED_T *>(this)->amont_amont(face,i); }
-  inline double dt_vitesse(const int face, int comp = 0) const { return static_cast<const DERIVED_T *>(this)->get_dt_vitesse(face, comp); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int elem_(const int i, const int j) const { return static_cast<const DERIVED_T *>(this)->template get_elem<ExecSpace>(i,j); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int amont_amont_(const int face, const int i) const { return static_cast<const DERIVED_T *>(this)->template get_amont_amont<ExecSpace>(face,i); }
+  KOKKOS_INLINE_FUNCTION double dist_elem(const int n1, const int n2, const int k) const { return static_cast<const DERIVED_T *>(this)->get_dist_elem(n1,n2,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double dt_vitesse(const int face, int comp = 0) const { return static_cast<const DERIVED_T *>(this)->template get_dt_vitesse<ExecSpace>(face, comp); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int dt_vitesse_nb_comp() const { return static_cast<const DERIVED_T *>(this)->template get_dt_vitesse_nb_comp<ExecSpace>(); }
   inline const DoubleTab& tab_vitesse() const { return static_cast<const DERIVED_T *>(this)->get_tab_vitesse(); }
-  inline double surface_porosite(const int face) const { return static_cast<const DERIVED_T *>(this)->get_surface_porosite(face); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double surface_porosite(const int face) const { return static_cast<const DERIVED_T *>(this)->template get_surface_porosite<ExecSpace>(face); }
 
   template <typename Type_Double> inline void quick_fram_(const Type_Double& psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, const DoubleTab& transporte, Type_Double& flux) const
   { static_cast<const DERIVED_T *>(this)->template quick_fram<Type_Double>(psc, num0, num1, num0_0, num1_1, face, transporte, flux); }
+  KOKKOS_INLINE_FUNCTION
+  void quick_fram_view_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, DoubleArrView flux) const
+  { static_cast<const DERIVED_T *>(this)->quick_fram_view(psc, num0, num1, num0_0, num1_1, face, transporte, flux); }
+  KOKKOS_INLINE_FUNCTION
+  void quick_fram_view_comp_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const
+  { static_cast<const DERIVED_T *>(this)->quick_fram_view_comp(psc, num0, num1, num0_0, num1_1, face, transporte, k, flux); }
 
   template <typename Type_Double> inline void qcentre_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, const DoubleTab& transporte, Type_Double& flux) const
   { static_cast<const DERIVED_T *>(this)->template qcentre<Type_Double>(psc,num0,num1,num0_0,num1_1,face,transporte,flux); }
+  KOKKOS_INLINE_FUNCTION
+  void qcentre_view_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, DoubleArrView flux) const
+  { static_cast<const DERIVED_T *>(this)->qcentre_view(psc,num0,num1,num0_0,num1_1,face,transporte,flux); }
+  KOKKOS_INLINE_FUNCTION
+  void qcentre_view_comp_(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const
+  { static_cast<const DERIVED_T *>(this)->qcentre_view_comp(psc,num0,num1,num0_0,num1_1,face,transporte,k,flux); }
 };
 
 #include <Eval_Conv_VDF_Elem.tpp> // templates specializations ici ;)
diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.tpp b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.tpp
index 3b2f1e3c45..caf50497c6 100644
--- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.tpp
+++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem.tpp
@@ -21,7 +21,7 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::flux_face(const DoubleTab& inco, cons
 {
   for (int n = 0; n < flux.size_array(); n++)
     {
-      const int ind = (tab_vitesse().line_size() == flux.size_array()) ? n : 0;
+      const int ind = (dt_vitesse_nb_comp() == flux.size_array()) ? n : 0;
       const double psc = dt_vitesse(f,ind) * surface_porosite(f);
       for (int i = 0, e; i < 2; i++)
         if ((e = elem_(f, i)) > -1)
@@ -34,7 +34,7 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::flux_face(const DoubleTab& inco, cons
 {
   for (int n = 0; n < flux.size_array(); n++)
     {
-      const int ind = (tab_vitesse().line_size() == flux.size_array()) ? n : 0;
+      const int ind = (dt_vitesse_nb_comp() == flux.size_array()) ? n : 0;
       const double psc = dt_vitesse(f, ind) * surface_porosite(f);
       for (int i = 0, e; i < 2; i++)
         if ((e = elem_(f, i)) > -1)
@@ -42,91 +42,84 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::flux_face(const DoubleTab& inco, cons
     }
 }
 
-template <typename DERIVED_T> template <typename Type_Double>
-inline void Eval_Conv_VDF_Elem<DERIVED_T>::flux_face(const DoubleTab& inco, const DoubleTab&, const int face, const Periodique& la_cl, const int num1, Type_Double& flux) const
-{
-  const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array();
 
-  if (!DERIVED_T::IS_AMONT)
+template <typename DERIVED_T> template <typename BC>
+KOKKOS_INLINE_FUNCTION void Eval_Conv_VDF_Elem<DERIVED_T>::flux_faces_bord_comp(CDoubleTabView inco, CDoubleTabView val_b, const int face, const BC_View& bc_view, const int num1, const int k, double& flux) const
+{
+  if constexpr (std::is_same_v<BC, Periodique>)
     {
-      const double psc = dt_vitesse(face) * surface_porosite(face);
-      const int i_0 = amont_amont_(face, 0), j_1 = amont_amont_(face, 1);
-
-      if (DERIVED_T::IS_CENTRE || DERIVED_T::IS_CENTRE4)
-        qcentre_ < Type_Double > (psc, i, j, i_0, j_1, face, inco, flux); // on applique le schema centre 2 ou 4
+      const int i = elem_<DeviceSpace>(face,0), j = elem_<DeviceSpace>(face,1);
+      if constexpr (!DERIVED_T::IS_AMONT)
+        {
+          const double psc = dt_vitesse<DeviceSpace>(face) * surface_porosite<DeviceSpace>(face);
+          const int i_0 = amont_amont_<DeviceSpace>(face,0), j_1 = amont_amont_<DeviceSpace>(face,1);
+          if constexpr (DERIVED_T::IS_CENTRE || DERIVED_T::IS_CENTRE4)
+            qcentre_view_comp_(psc,i,j,i_0,j_1,face,inco,k,flux);
+          else
+            quick_fram_view_comp_(psc,i,j,i_0,j_1,face,inco,k,flux);
+          flux *= -1.;
+        }
       else
         {
-          Type_Double psc_multi(ncomp);
-          for (int k = 0; k < ncomp; k++)
-            {
-              const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0;
-              psc_multi[k] = dt_vitesse(face, ind) * surface_porosite(face);
-            }
-          quick_fram_(psc_multi, i, j, i_0, j_1, face, inco, flux); // on applique le schema Quick
+          const int ind = (dt_vitesse_nb_comp<DeviceSpace>() == 1) ? 0 : k;
+          const double psc = dt_vitesse<DeviceSpace>(face,ind) * surface_porosite<DeviceSpace>(face);
+          flux = (psc > 0) ? -psc * inco(i,k) : -psc * inco(j,k);
         }
-
-      for (int k = 0; k < ncomp; k++) flux[k] *= -1;
+    }
+  else if constexpr (std::is_same_v<BC, Dirichlet_entree_fluide> || std::is_same_v<BC, Neumann_sortie_libre>)
+    {
+      const int ind = (dt_vitesse_nb_comp<DeviceSpace>() == 1) ? 0 : k;
+      const double psc = dt_vitesse<DeviceSpace>(face,ind) * surface_porosite<DeviceSpace>(face);
+      flux = 0.;
+      for (int i = 0, e; i < 2; i++)
+        if ((e = elem_<DeviceSpace>(face,i)) > -1)
+          flux = -psc * (((psc > 0 && !i) || (psc <= 0 && i)) ? inco(e,k) : val_b(face,k));
     }
   else
-    for (int k = 0; k < ncomp; k++)
-      {
-        const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0;
-        const double psc = dt_vitesse(face, ind) * surface_porosite(face);
-        flux[k] = (psc > 0) ? -psc * inco(i, k) : -psc * inco(j, k); /* AMONT */
-      }
+    flux = 0.; // generic do nothing
 }
 
-template <typename DERIVED_T> template <typename Type_Double>
-inline void Eval_Conv_VDF_Elem<DERIVED_T>::flux_faces_interne(const DoubleTab& inco, const int face, Type_Double& flux) const
+
+template <typename DERIVED_T>
+KOKKOS_INLINE_FUNCTION void Eval_Conv_VDF_Elem<DERIVED_T>::flux_faces_interne_comp(CDoubleTabView inco, const int face, const int k, double& flux) const
 {
-  const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array();
-  if (!DERIVED_T::IS_AMONT)
+  const int i = elem_<DeviceSpace>(face,0), j = elem_<DeviceSpace>(face,1);
+  if constexpr (!DERIVED_T::IS_AMONT)
     {
-      const double psc = dt_vitesse(face)*surface_porosite(face);
-      const int i_0 = amont_amont_(face,0), j_1 = amont_amont_(face,1);
-      if (DERIVED_T::IS_CENTRE)
+      const double psc = dt_vitesse<DeviceSpace>(face)*surface_porosite<DeviceSpace>(face);
+      const int i_0 = amont_amont_<DeviceSpace>(face,0), j_1 = amont_amont_<DeviceSpace>(face,1);
+      if constexpr (DERIVED_T::IS_CENTRE)
         {
-          qcentre_<Type_Double>(psc,i,j,i_0,j_1,face,inco,flux);
-          for (int k=0; k<ncomp; k++) flux[k] *= -1;
+          qcentre_view_comp_(psc,i,j,i_0,j_1,face,inco,k,flux);
+          flux *= -1.;
         }
-      else if (DERIVED_T::IS_CENTRE4)
+      else if constexpr (DERIVED_T::IS_CENTRE4)
         {
-          if ( (i_0 == -1) || (j_1 == -1) )
-            for (int k=0; k<ncomp; k++) flux[k] = -psc*(inco(i,k)+inco(j,k))/2.; // on applique le schema centre2
-          else // on applique le schema centre4
+          if ((i_0 == -1) || (j_1 == -1))
+            flux = -psc*(inco(i,k)+inco(j,k))/2.;
+          else
             {
-              qcentre_<Type_Double>(psc,i,j,i_0,j_1,face,inco,flux);
-              for (int k=0; k<ncomp; k++) flux[k] *= -1;
+              qcentre_view_comp_(psc,i,j,i_0,j_1,face,inco,k,flux);
+              flux *= -1.;
             }
         }
       else
         {
-          /* *****************************************************************************************************************************************************
-           * Alexandre C. 19/02/03 : We do not use the first order upwind scheme as it was done before because in LES computations all turbulence is damped
-           * and we can not recover a proper behavior of physical turbulent characteristics, especially when using wall-functions.
-           * Therefore we use the 2nd order centered scheme.
-           * *****************************************************************************************************************************************************
-           * Pierre L. 14/10/04: Correction car le centre explose sur le cas VALIDA On revient au quick en essayant d'ameliorer: on prend le quick si psc est
-           * encore favorable pour avoir les 3 points necessaires au calcul du quick. Cela est deja ce qui est fait pour le quick-sharp de l'evaluateur aux faces.
-           * *****************************************************************************************************************************************************/
-
-          Type_Double psc_multi(ncomp);
-          for (int k = 0; k < ncomp; k++)
+          if ((i_0 == -1 && psc >= 0) || (j_1 == -1 && psc <= 0))
+            flux = (psc > 0) ? -psc*inco(i,k) : -psc*inco(j,k);
+          else
             {
-              const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0;
-              psc_multi[k] = dt_vitesse(face, ind) * surface_porosite(face);
+              quick_fram_view_comp_(psc,i,j,i_0,j_1,face,inco,k,flux);
+              flux *= -1.;
             }
-          quick_fram_(psc_multi,i,j,i_0,j_1,face,inco,flux);
-          for (int k=0; k<ncomp; k++) flux[k] *= -1;
         }
     }
   else
-    for(int k = 0; k < ncomp; k++)
-      {
-        const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0;
-        const double psc = dt_vitesse(face, ind)*surface_porosite(face);
-        flux[k] = (psc > 0) ? -psc * inco(i, k) : -psc * inco(j, k); /* AMONT */
-      }
+    {
+      const int ind = (dt_vitesse_nb_comp<DeviceSpace>() == 1) ? 0 : k;
+      const double psc = dt_vitesse<DeviceSpace>(face, ind)*surface_porosite<DeviceSpace>(face);
+      flux = (psc > 0) ? -psc * inco(i, k) : -psc * inco(j, k);
+    }
 }
 
 /* ************************************** *
@@ -149,7 +142,7 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::coeffs_face_common(const int face, Ty
       else
         for (int k = 0; k < ncomp; k++)
           {
-            const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0;
+            const int ind = (dt_vitesse_nb_comp() == ncomp) ? k : 0;
             psc = dt_vitesse(face, ind)*surface_porosite(face);
             aii[k] = (psc > 0) ? psc : 0.;
             ajj[k] = 0.;
@@ -165,7 +158,7 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::coeffs_face_common(const int face, Ty
       else
         for (int k = 0; k < ncomp; k++)
           {
-            const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0;
+            const int ind = (dt_vitesse_nb_comp() == ncomp) ? k : 0;
             psc = dt_vitesse(face, ind)*surface_porosite(face);
             ajj[k] = (psc < 0) ? -psc : 0.;
             aii[k] = 0.;
@@ -203,7 +196,7 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::coeffs_face(const int face, const int
   else
     for (int k = 0; k < ncomp; k++)
       {
-        const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0;
+        const int ind = (dt_vitesse_nb_comp() == ncomp) ? k : 0;
         const double psc = dt_vitesse(face, ind)*surface_porosite(face);
         aii[k] = (psc > 0) ? psc : 0.;
         ajj[k] = (psc > 0) ? 0. : -psc;
@@ -237,7 +230,7 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::coeffs_faces_interne(const int face,
   else
     for (int k = 0; k < ncomp; k++)
       {
-        const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0;
+        const int ind = (dt_vitesse_nb_comp() == ncomp) ? k : 0;
         const double psc = dt_vitesse(face, ind)*surface_porosite(face);
         aii[k] = (psc > 0) ? psc : 0.;
         ajj[k] = (psc > 0) ? 0. : -psc;
@@ -254,7 +247,7 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::coeffs_face_bloc_vitesse(const Double
         for (int i = 0, e; i < 2; i++)
           if ((e = elem_(f, i)) > -1)
             {
-              const int ind = (tab_vitesse().line_size() == flux.size_array()) ? n : 0;
+              const int ind = (dt_vitesse_nb_comp() == flux.size_array()) ? n : 0;
               flux[n] = surface_porosite(f) *
                         (((dt_vitesse(f, ind) > 0 && !i) || (dt_vitesse(f, ind) <= 0 && i)) ? inco(e, n) : val_b(f,
                                                                                                                  n));
@@ -272,7 +265,7 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::coeffs_face_bloc_vitesse(const Double
         for (int i = 0, e; i < 2; i++)
           if ((e = elem_(f, i)) > -1)
             {
-              const int ind = (tab_vitesse().line_size() == flux.size_array()) ? n : 0;
+              const int ind = (dt_vitesse_nb_comp() == flux.size_array()) ? n : 0;
               flux[n] = surface_porosite(f) *
                         (((dt_vitesse(f, ind) > 0 && !i) || (dt_vitesse(f, ind) <= 0 && i)) ? inco(e, n) : val_b(f,
                                                                                                                  n));
@@ -296,7 +289,7 @@ inline void Eval_Conv_VDF_Elem<DERIVED_T>::coeffs_face_bloc_vitesse_common(const
       else
         for (int k = 0; k < ncomp; k++)
           {
-            const int ind = (tab_vitesse().line_size() == ncomp) ? k : 0;
+            const int ind = (dt_vitesse_nb_comp() == ncomp) ? k : 0;
             flux[k] = (dt_vitesse(face, ind) > 0) ? psc * inco(i, k) : psc * inco(j, k);
           }
     }
diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem_leaves.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem_leaves.h
index 6ed54b39c4..fbf309ce88 100644
--- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem_leaves.h
+++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Elem_leaves.h
@@ -55,14 +55,21 @@ class Eval_Centre_VDF_Elem : public Eval_Conv_VDF_Elem<Eval_Centre_VDF_Elem>, pu
 {
 public:
   static constexpr bool IS_CENTRE = true;
-  inline int amont_amont(int face, int i) const override { return le_dom->amont_amont(face, i); }
-  inline double dim_elem(int n1, int k) const override { return le_dom->dim_elem(n1,k); }
-  inline double dist_elem(int n1, int n2, int k) const override { return le_dom->dist_elem(n1,n2,k); }
-  inline double dist_face_elem1(int num_face,int n1) const { return le_dom->dist_face_elem1(num_face, n1); }
-
   template <typename Type_Double>
   inline void qcentre(const double psc, const int num0, const int num1, const int num0_0, const int num1_1,const int face,const DoubleTab& transporte, Type_Double& flux) const
-  { qcentre2_impl<Type_Double>(psc,num0,num1,num0_0,num1_1,face,transporte,flux); }
+  {
+    qcentre2_impl<Type_Double>(psc,num0,num1,num0_0,num1_1,face,transporte,flux);
+  }
+  KOKKOS_INLINE_FUNCTION
+  void qcentre_view(const double psc, const int num0, const int num1, const int num0_0, const int num1_1,const int face,CDoubleTabView transporte, DoubleArrView flux) const
+  {
+    qcentre2_impl_view(psc,num0,num1,num0_0,num1_1,face,transporte,flux);
+  }
+  KOKKOS_INLINE_FUNCTION
+  void qcentre_view_comp(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const
+  {
+    qcentre2_impl_comp(psc,num0,num1,transporte,k,flux);
+  }
 };
 
 /*! @brief class Eval_Centre4_VDF_Elem Evaluateur VDF pour la convection Le champ convecte est scalaire (Champ_P0_VDF)
@@ -75,18 +82,29 @@ class Eval_Centre4_VDF_Elem : public Eval_Conv_VDF_Elem<Eval_Centre4_VDF_Elem>,
 
 public:
   static constexpr bool IS_CENTRE4 = true;
-  inline int amont_amont(int face, int i) const override { return le_dom->amont_amont(face, i); }
-  inline double dist_elem(int n1, int n2, int k) const override { return le_dom->dist_elem_period(n1,n2,k); }
-  template <typename Type_Double> inline void qcentre(const double ,const int ,const int ,const int ,const int ,const int , const DoubleTab& ,Type_Double& ) const;
+  template <typename Type_Double>
+  inline void qcentre(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, const DoubleTab& transporte, Type_Double& flux) const
+  {
+    const int ori = orientation(face);
+    const double dx = get_dist_elem_period(num0, num1, ori), dxam = get_dist_elem_period(num0_0, num0, ori), dxav = get_dist_elem_period(num1, num1_1, ori);
+    qcentre4_impl<Type_Double>(ori,dx,dxam,dxav,psc,num0,num1,num0_0,num1_1,face,transporte,flux);
+  }
+  KOKKOS_INLINE_FUNCTION
+  void qcentre_view(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, DoubleArrView flux) const
+  {
+    const int ori = orientation_v_(face);
+    const double dx = get_dist_elem_period<DeviceSpace>(num0, num1, ori), dxam = get_dist_elem_period<DeviceSpace>(num0_0, num0, ori), dxav = get_dist_elem_period<DeviceSpace>(num1, num1_1, ori);
+    qcentre4_impl_view(ori,dx,dxam,dxav,psc,num0,num1,num0_0,num1_1,face,transporte,flux);
+  }
+  KOKKOS_INLINE_FUNCTION
+  void qcentre_view_comp(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const
+  {
+    const int ori = orientation_v_(face);
+    const double dx = get_dist_elem_period<DeviceSpace>(num0, num1, ori), dxam = get_dist_elem_period<DeviceSpace>(num0_0, num0, ori), dxav = get_dist_elem_period<DeviceSpace>(num1, num1_1, ori);
+    qcentre4_impl_comp(ori,dx,dxam,dxav,psc,num0,num1,num0_0,num1_1,transporte,k,flux);
+  }
 };
 
-template <typename Type_Double>
-inline void Eval_Centre4_VDF_Elem::qcentre(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, const DoubleTab& transporte, Type_Double& flux) const
-{
-  const int ori = orientation(face);
-  const double dx = dist_elem(num0, num1, ori), dxam = dist_elem(num0_0, num0, ori), dxav = dist_elem(num1, num1_1, ori);
-  qcentre4_impl<Type_Double>(ori,dx,dxam,dxav,psc,num0,num1,num0_0,num1_1,face,transporte,flux);
-}
 
 /*! @brief class Eval_Quick_VDF_Elem Evaluateur VDF pour la convection Le champ convecte est scalaire (Champ_P0_VDF)
  *
@@ -97,20 +115,33 @@ class Eval_Quick_VDF_Elem : public Eval_Conv_VDF_Elem<Eval_Quick_VDF_Elem>, publ
 {
 public:
   static constexpr bool IS_QUICK = true;
-  inline int amont_amont(int face, int i) const override { return le_dom->amont_amont(face, i); }
-  inline double dim_elem(int n1, int k) const override { return le_dom->dim_elem(n1,k); }
-  inline double dist_elem(int n1, int n2, int k) const override { return le_dom->dist_elem_period(n1,n2,k); }
-  template <typename Type_Double> inline void quick_fram(const Type_Double&, const int, const int,const int, const int ,const int ,const DoubleTab&, Type_Double& ) const;
+  template <typename Type_Double>
+  inline void quick_fram(const double psc, const int num0, const int num1,const int num0_0, const int num1_1, const int face,const DoubleTab& transporte, Type_Double& flux) const
+  {
+    const int ori = orientation(face);
+    const double dx = get_dist_elem_period(num0, num1, ori),
+                 dm0 = get_dim_elem(num0, ori), dxam0 = (num0_0!=-1?get_dist_elem_period(num0_0, num0, ori):0),
+                 dm1 = get_dim_elem(num1, ori), dxam1 = (num1_1!=-1?get_dist_elem_period(num1, num1_1, ori):0);
+    quick_fram_impl<Type_Double>(ori,dx,dm0,dxam0,dm1,dxam1,psc,num0,num1,num0_0,num1_1,face,transporte,flux);
+  }
+  KOKKOS_INLINE_FUNCTION
+  void quick_fram_view(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, DoubleArrView flux) const
+  {
+    const int ori = orientation_v_(face);
+    const double dx = get_dist_elem_period<DeviceSpace>(num0, num1, ori),
+                 dm0 = get_dim_elem<DeviceSpace>(num0, ori), dxam0 = (num0_0!=-1?get_dist_elem_period<DeviceSpace>(num0_0, num0, ori):0),
+                 dm1 = get_dim_elem<DeviceSpace>(num1, ori), dxam1 = (num1_1!=-1?get_dist_elem_period<DeviceSpace>(num1, num1_1, ori):0);
+    quick_fram_impl_view(ori,dx,dm0,dxam0,dm1,dxam1,psc,num0,num1,num0_0,num1_1,face,transporte,flux);
+  }
+  KOKKOS_INLINE_FUNCTION
+  void quick_fram_view_comp(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, const int k, double& flux) const
+  {
+    const int ori = orientation_v_(face);
+    const double dx = get_dist_elem_period<DeviceSpace>(num0, num1, ori),
+                 dm0 = get_dim_elem<DeviceSpace>(num0, ori), dxam0 = (num0_0!=-1?get_dist_elem_period<DeviceSpace>(num0_0, num0, ori):0),
+                 dm1 = get_dim_elem<DeviceSpace>(num1, ori), dxam1 = (num1_1!=-1?get_dist_elem_period<DeviceSpace>(num1, num1_1, ori):0);
+    quick_fram_impl_comp(ori,dx,dm0,dxam0,dm1,dxam1,psc,num0,num1,num0_0,num1_1,transporte,k,flux);
+  }
 };
 
-template <typename Type_Double>
-inline void Eval_Quick_VDF_Elem::quick_fram(const Type_Double& psc, const int num0, const int num1,const int num0_0, const int num1_1, const int face,const DoubleTab& transporte, Type_Double& flux) const
-{
-  const int ori = orientation(face);
-  const double dx = dist_elem(num0, num1, ori),
-               dm0 = dim_elem(num0, ori), dxam0 = (num0_0!=-1?dist_elem(num0_0, num0, ori):0),
-               dm1 = dim_elem(num1, ori), dxam1 = (num1_1!=-1?dist_elem(num1, num1_1, ori):0);
-  quick_fram_impl<Type_Double>(ori,dx,dm0,dxam0,dm1,dxam1,psc,num0,num1,num0_0,num1_1,face,transporte,flux);
-}
-
 #endif /* Eval_Conv_VDF_Elem_leaves_included */
diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.h
index 192ae8f759..1b1520a87b 100644
--- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.h
+++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -34,40 +34,53 @@ class Eval_Conv_VDF_Face : public Eval_VDF_Face
    * *********  POUR L'EXPLICITE ********** *
    * ************************************** */
 
-  template<Type_Flux_Fa7 Fa7_Type, typename Type_Double> inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void>
-  flux_fa7(const DoubleTab&, const DoubleTab*, int , const Neumann_sortie_libre&, int, Type_Double& ) const;
+  template<Type_Flux_Arete Arete_Type, typename Type_Double>
+  inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void>
+  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ;
 
-  template<Type_Flux_Fa7 Fa7_Type, typename Type_Double> inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void>
-  flux_fa7(const DoubleTab&, const DoubleTab*,int, int, int, Type_Double& ) const;
+  // _comp variants: each overload returns void and writes the flux of a single component k into its double& output argument(s)
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::INTERNE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ;
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::MIXTE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*,int, int, int, int, Type_Double& ) const ;
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, double& flux) const { flux = 0.; }
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double>
-  inline std::enable_if_t<(Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI), void>
-  flux_arete(const DoubleTab&, const DoubleTab*,int, int, int, int, Type_Double& ) const { /* do nothing */ }
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, int, double&, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double>
-  inline std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*,int, int, int, int, Type_Double&, Type_Double& ) const;
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::PERIODICITE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PERIODICITE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double& ) const ;
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double&) const;
+  template<Type_Flux_Fa7 Fa7_Type>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<Fa7_Type == Type_Flux_Fa7::ELEM, void>
+  flux_fa7_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, double&) const;
+
+  template<Type_Flux_Fa7 Fa7_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void>
+  flux_fa7_comp(CDoubleTabView, CDoubleTabView, int, CDoubleTabView, int, int, double&) const;
 
   /* ************************************** *
    * *********  POUR L'IMPLICITE ********** *
    * ************************************** */
 
-  template<Type_Flux_Fa7 Fa7_Type, typename Type_Double> inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void>
+  template<Type_Flux_Fa7 Fa7_Type, typename Type_Double>
+  inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void>
   coeffs_fa7(const DoubleTab*, int , const Neumann_sortie_libre&, Type_Double& , Type_Double& ) const;
 
-  template<Type_Flux_Fa7 Fa7_Type, typename Type_Double> inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void>
+  template<Type_Flux_Fa7 Fa7_Type, typename Type_Double>
+  inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void>
   coeffs_fa7(const DoubleTab*, int, int, int, Type_Double& , Type_Double& ) const;
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double>
@@ -76,43 +89,56 @@ class Eval_Conv_VDF_Face : public Eval_VDF_Face
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double>
   inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE || Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
-  coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const;
+  coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const;
+
+  template<Type_Flux_Arete Arete_Type, typename Type_Double>
+  inline std::enable_if_t<(Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI), void>
+  coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const { /* do nothing */ }
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline
-  std::enable_if_t<(Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI), void>
-  coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const { /* do nothing */ }
 
 private:
   template <typename Type_Double>
   inline void fill_coeffs_proto(const int, const double , const double, Type_Double& , Type_Double& ) const;
 
   // CRTP pattern to static_cast the appropriate class and get the implementation: This is magic !
-  inline int premiere_face_bord() const { return static_cast<const DERIVED_T *>(this)->get_premiere_face_bord(); }
-  inline int orientation(int face) const { return static_cast<const DERIVED_T *>(this)->get_orientation(face); }
-  inline int elem_(int i, int j) const { return static_cast<const DERIVED_T *>(this)->get_elem(i,j); }
-  inline int face_amont_princ_(int num_face, int i) const { return static_cast<const DERIVED_T *>(this)->face_amont_princ(num_face,i); }
-  inline int face_amont_conj_(int num_face,int i, int k) const { return static_cast<const DERIVED_T *>(this)->face_amont_conj(num_face,i,k); }
-  inline double dt_vitesse(int face, int comp = 0) const { return static_cast<const DERIVED_T *>(this)->get_dt_vitesse(face, comp); }
-  inline double surface_porosite(int face) const { return static_cast<const DERIVED_T *>(this)->get_surface_porosite(face); }
-  inline double surface(int face) const { return static_cast<const DERIVED_T *>(this)->get_surface(face); }
-  inline double porosite(int face) const { return static_cast<const DERIVED_T *>(this)->get_porosite(face); }
-  inline double dim_face_(int n1,int k) const { return static_cast<const DERIVED_T *>(this)->dim_face(n1,k); }
-  inline double dim_elem_(int n1,int k) const { return static_cast<const DERIVED_T *>(this)->dim_elem(n1,k); }
-  inline double dist_face_(int n1,int n2,int k) const { return static_cast<const DERIVED_T *>(this)->dist_face(n1,n2,k); }
-  inline double dist_face_period_(int n1,int n2,int k) const { return static_cast<const DERIVED_T *>(this)->dist_face_period(n1,n2,k); }
-  inline double dist_elem_period_(int n1, int n2, int k) const { return static_cast<const DERIVED_T *>(this)->dist_elem_period(n1,n2,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int orientation(int face) const { return static_cast<const DERIVED_T *>(this)->template get_orientation<ExecSpace>(face); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int elem_(int i, int j) const { return static_cast<const DERIVED_T *>(this)->template get_elem<ExecSpace>(i,j); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION int face_amont_princ_(int num_face, int i) const { return static_cast<const DERIVED_T *>(this)->template get_face_amont_princ<ExecSpace>(num_face,i); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_FORCEINLINE_FUNCTION int face_amont_conj_(int num_face,int i, int k) const { return static_cast<const DERIVED_T *>(this)->template get_face_amont_conj<ExecSpace>(num_face,i,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double dt_vitesse(int face, int comp = 0) const { return static_cast<const DERIVED_T *>(this)->template get_dt_vitesse<ExecSpace>(face, comp); }
+  //KOKKOS_INLINE_FUNCTION double surface_porosite(int face) const { return static_cast<const DERIVED_T *>(this)->get_surface_porosite(face); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double surface(int face) const { return static_cast<const DERIVED_T *>(this)->template get_surface<ExecSpace>(face); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double porosite(int face) const { return static_cast<const DERIVED_T *>(this)->template get_porosite<ExecSpace>(face); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double dim_face_(int n1,int k) const { return static_cast<const DERIVED_T *>(this)->template get_dim_face<ExecSpace>(n1,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double dim_elem_(int n1,int k) const { return static_cast<const DERIVED_T *>(this)->template get_dim_elem<ExecSpace>(n1,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double dist_face_(int n1,int n2,int k) const { return static_cast<const DERIVED_T *>(this)->template get_dist_face<ExecSpace>(n1,n2,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double dist_face_period_(int n1,int n2,int k) const { return static_cast<const DERIVED_T *>(this)->template get_dist_face_period<ExecSpace>(n1,n2,k); }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double dist_elem_period_(int n1, int n2, int k) const { return static_cast<const DERIVED_T *>(this)->template get_dist_elem_period<ExecSpace>(n1,n2,k); }
+
   inline const Domaine_Cl_VDF& la_zcl() const { return static_cast<const DERIVED_T *>(this)->get_la_zcl(); }
 
-  inline double conv_quick_sharp_plus_(const double psc,const double vit_0, const double vit_1, const double vit_0_0, const double dx, const double dm, const double dxam) const
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus_(const double psc,const double vit_0, const double vit_1, const double vit_0_0, const double dx, const double dm, const double dxam) const
   { return static_cast<const DERIVED_T *>(this)->conv_quick_sharp_plus(psc,vit_0,vit_1,vit_0_0,dx,dm,dxam); }
 
-  inline double conv_quick_sharp_moins_(const double psc,const double vit_0,const double vit_1, const double vit_1_1,const double dx, const double dm,const double dxam) const
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins_(const double psc,const double vit_0,const double vit_1, const double vit_1_1,const double dx, const double dm,const double dxam) const
   { return static_cast<const DERIVED_T *>(this)->conv_quick_sharp_moins(psc,vit_0,vit_1,vit_1_1,dx,dm,dxam); }
 
-  inline double conv_centre_(const double psc,const double vit_0_0, const double vit_0, const double vit_1, const double vit1_1,double g1, double g2, double g3,double g4) const
+  KOKKOS_INLINE_FUNCTION double conv_centre_(const double psc,const double vit_0_0, const double vit_0, const double vit_1, const double vit1_1,double g1, double g2, double g3,double g4) const
   { return static_cast<const DERIVED_T *>(this)->conv_centre(psc,vit_0_0,vit_0,vit_1,vit1_1,g1,g2,g3,g4); }
 
-  inline void calcul_g_(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const
+  KOKKOS_INLINE_FUNCTION void calcul_g_(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const
   { static_cast<const DERIVED_T *>(this)->calcul_g(dxam,dx,dxav,g1,g2,g3,g4); }
 };
 
diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.tpp b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.tpp
index b398471925..57f0ec1eef 100644
--- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.tpp
+++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face.tpp
@@ -20,102 +20,67 @@
  * *********  POUR L'EXPLICITE ********** *
  * ************************************** */
 
-template <typename DERIVED_T> template<Type_Flux_Fa7 Fa7_Type, typename Type_Double> inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void>
-Eval_Conv_VDF_Face<DERIVED_T>::flux_fa7(const DoubleTab& inco, const DoubleTab* a_r, int face, const Neumann_sortie_libre& la_cl, int num1, Type_Double& flux) const
-{
-  const int elem1 = elem_(face, 0), elem2 = elem_(face,1);
-  for (int k = 0; k < flux.size_array(); k++)
-    {
-      double psc = dt_vitesse(face, k) * surface(face);
-      if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)((elem1 != -1) ? elem1 : elem2, k);
-      flux[k] = -psc * inco(face, k) * porosite(face);
-    }
-}
-
-template <typename DERIVED_T> template<Type_Flux_Fa7 Fa7_Type, typename Type_Double> inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void>
-Eval_Conv_VDF_Face<DERIVED_T>::flux_fa7(const DoubleTab& inco, const DoubleTab* a_r, int num_elem, int fac1, int fac2, Type_Double& flux) const
+template <typename DERIVED_T> template<Type_Flux_Fa7 Fa7_Type> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void>
+Eval_Conv_VDF_Face<DERIVED_T>::flux_fa7_comp(CDoubleTabView inco, CDoubleTabView a_r, int num_elem, int fac1, int fac2, int k, double& flux) const
 {
-  const int ncomp = flux.size_array();
-  double psc = 0.25*(dt_vitesse(fac1)+dt_vitesse(fac2))*(surface(fac1)+surface(fac2));
+  double psc = 0.25 * (dt_vitesse<DeviceSpace>(fac1) + dt_vitesse<DeviceSpace>(fac2)) *
+               (surface<DeviceSpace>(fac1) + surface<DeviceSpace>(fac2));
   if (DERIVED_T::IS_AMONT)
     {
-      for (int k = 0; k < ncomp; k++)
+      psc = 0.25 * (dt_vitesse<DeviceSpace>(fac1, k) + dt_vitesse<DeviceSpace>(fac2, k)) *
+            (surface<DeviceSpace>(fac1) + surface<DeviceSpace>(fac2));
+      const int f = psc > 0 ? fac1 : fac2;
+      if (a_r.size() > 0)
         {
-          psc = 0.25*(dt_vitesse(fac1,k)+dt_vitesse(fac2,k))*(surface(fac1)+surface(fac2));
-          const int f = psc > 0 ? fac1 : fac2;
-
-          if (a_r)
-            {
-              const int elem = elem_(f, 0), elem2 = elem_(f, 1);
-              const int e = dt_vitesse(f,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem);
-              psc *= (*a_r)(e, k);
-            }
-
-          flux[k] = -psc * inco(f, k) * porosite(f);
+          const int elem = elem_<DeviceSpace>(f, 0), elem2 = elem_<DeviceSpace>(f, 1);
+          const int e = dt_vitesse<DeviceSpace>(f, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem);
+          psc *= a_r(e, k);
         }
+      flux = -psc * inco(f, k) * porosite<DeviceSpace>(f);
     }
   else if (DERIVED_T::IS_CENTRE)
-    for (int k = 0; k < ncomp; k++) flux[k] = -psc*0.5*(inco(fac1,k)*porosite(fac1)+inco(fac2,k)*porosite(fac2));
+    flux = -psc * 0.5 * (inco(fac1, k) * porosite<DeviceSpace>(fac1) + inco(fac2, k) * porosite<DeviceSpace>(fac2));
   else
     {
-      const int num0_0 = face_amont_princ_(fac1,0), num1_1 = face_amont_princ_(fac2,1);
+      const int num0_0 = face_amont_princ_<DeviceSpace>(fac1, 0), num1_1 = face_amont_princ_<DeviceSpace>(fac2, 1);
       if (DERIVED_T::IS_CENTRE4)
         {
-          const int ori = orientation(fac1);
-          if  ( (num0_0 == -1) || (num1_1== -1) )
-            for (int k = 0; k < ncomp; k++) flux[k] = -psc*0.5*(inco(fac1,k)*porosite(fac1)+inco(fac2,k)*porosite(fac2)); // Schema centre 2
-          else // Schema centre 4
+          const int ori = orientation<DeviceSpace>(fac1);
+          if ((num0_0 == -1) || (num1_1 == -1))
+            flux = -psc * 0.5 * (inco(fac1, k) * porosite<DeviceSpace>(fac1) + inco(fac2, k) * porosite<DeviceSpace>(fac2));
+          else
             {
-              Type_Double vit_0(ncomp),vit_0_0(ncomp),vit_1_1(ncomp),vit_1(ncomp);
-              const double dx = dim_elem_(num_elem,ori), dxam = dim_elem_(elem_(fac1,0),ori), dxav = dim_elem_(elem_(fac2,1),ori);
+              const double dx = dim_elem_<DeviceSpace>(num_elem, ori);
+              const double dxam = dim_elem_<DeviceSpace>(elem_<DeviceSpace>(fac1, 0), ori);
+              const double dxav = dim_elem_<DeviceSpace>(elem_<DeviceSpace>(fac2, 1), ori);
               double g1, g2, g3, g4;
-              calcul_g_(dxam,dx,dxav,g1,g2,g3,g4);
-              for (int k = 0; k < ncomp; k++)
-                {
-                  vit_0_0[k] = inco(num0_0,k)*porosite(num0_0);
-                  vit_0[k] = inco(fac1,k)*porosite(fac1);
-                  vit_1[k] = inco(fac2,k)*porosite(fac2);
-                  vit_1_1[k] = inco(num1_1,k)*porosite(num1_1);
-                  flux[k] = -conv_centre_(psc,vit_0_0[k],vit_0[k],vit_1[k],vit_1_1[k],g1,g2,g3,g4);
-                }
+              calcul_g_(dxam, dx, dxav, g1, g2, g3, g4);
+              flux = -conv_centre_(psc, inco(num0_0, k) * porosite<DeviceSpace>(num0_0), inco(fac1, k) * porosite<DeviceSpace>(fac1),
+                                   inco(fac2, k) * porosite<DeviceSpace>(fac2), inco(num1_1, k) * porosite<DeviceSpace>(num1_1), g1, g2, g3, g4);
             }
         }
-      else // QUICK
+      else  // QUICK
         {
-          if  (psc > 0)
+          if (psc > 0)
             {
               if (num0_0 == -1)
-                for (int k=0; k<ncomp; k++) flux[k] = -psc*inco(fac1,k)*porosite(fac1); // Schema amont
+                flux = -psc * inco(fac1, k) * porosite<DeviceSpace>(fac1);
               else
                 {
-                  Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1(ncomp);
-                  const int ori = orientation(fac1), elem_amont = elem_(fac1,0);
-                  const double dx = dim_elem_(num_elem,ori), dm = dist_elem_period_(elem_amont,num_elem,ori), dxam = dim_elem_(elem_amont,ori);
-                  for (int k = 0; k < ncomp; k++)
-                    {
-                      vit_0[k] = inco(fac1,k)*porosite(fac1);
-                      vit_1[k] = inco(fac2,k)*porosite(fac2);
-                      vit_0_0[k] = inco(num0_0,k)*porosite(num0_0);
-                      flux[k] = -conv_quick_sharp_plus_(psc,vit_0[k],vit_1[k],vit_0_0[k],dx,dm,dxam);
-                    }
+                  const int ori = orientation<DeviceSpace>(fac1), elem_amont = elem_<DeviceSpace>(fac1, 0);
+                  const double dx = dim_elem_<DeviceSpace>(num_elem, ori), dm = dist_elem_period_<DeviceSpace>(elem_amont, num_elem, ori), dxam = dim_elem_<DeviceSpace>(elem_amont, ori);
+                  flux = -conv_quick_sharp_plus_(psc, inco(fac1, k) * porosite<DeviceSpace>(fac1), inco(fac2, k) * porosite<DeviceSpace>(fac2), inco(num0_0, k) * porosite<DeviceSpace>(num0_0), dx, dm, dxam);
                 }
             }
-          else // (psc < 0)
+          else
             {
               if (num1_1 == -1)
-                for (int k = 0; k < ncomp; k++) flux[k] = -psc*inco(fac2,k)*porosite(fac2); // Schema amont
+                flux = -psc * inco(fac2, k) * porosite<DeviceSpace>(fac2);
               else
                 {
-                  Type_Double vit_0(ncomp), vit_1(ncomp), vit_1_1(ncomp);
-                  const int ori = orientation(fac2), elem_amont = elem_(fac2,1);
-                  const double dx = dim_elem_(num_elem,ori), dm = dist_elem_period_(num_elem,elem_amont,ori), dxam = dim_elem_(elem_amont,ori);
-                  for (int k = 0; k < ncomp; k++)
-                    {
-                      vit_0[k] = inco(fac1,k)*porosite(fac1);
-                      vit_1[k] = inco(fac2,k)*porosite(fac2);
-                      vit_1_1[k] = inco(num1_1,k)*porosite(num1_1);
-                      flux[k] = -conv_quick_sharp_moins_(psc,vit_0[k],vit_1[k],vit_1_1[k],dx,dm,dxam);
-                    }
+                  const int ori = orientation<DeviceSpace>(fac2), elem_amont = elem_<DeviceSpace>(fac2, 1);
+                  const double dx = dim_elem_<DeviceSpace>(num_elem, ori), dm = dist_elem_period_<DeviceSpace>(num_elem, elem_amont, ori), dxam = dim_elem_<DeviceSpace>(elem_amont, ori);
+                  flux = -conv_quick_sharp_moins_(psc, inco(fac1, k) * porosite<DeviceSpace>(fac1), inco(fac2, k) * porosite<DeviceSpace>(fac2), inco(num1_1, k) * porosite<DeviceSpace>(num1_1), dx, dm, dxam);
                 }
             }
         }
@@ -155,18 +120,17 @@ Eval_Conv_VDF_Face<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab
             for (int k = 0; k < ncomp; k++) flux[k] = -0.5*(inco(fac3,k)+inco(fac4,k))*psc ; // Schema centre 2 (pas assez de faces)
           else  // Schema Centre 4
             {
-              Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1_1(ncomp), vit_1(ncomp);
               // Inutile de prendre dist_face_period pour dx car fac3 et fac4 ne peuvent etre periodiques (arete interne)
               const double dx = dist_face_(fac3,fac4,ori), dxam = dist_face_period_(num0_0,fac3,ori), dxav = dist_face_period_(fac4,num1_1,ori);
               double g1, g2, g3, g4;
               calcul_g_(dxam,dx,dxav,g1,g2,g3,g4);
               for (int k = 0; k < ncomp; k++)
                 {
-                  vit_0_0[k] = inco(num0_0,k);
-                  vit_0[k] = inco(fac3,k);
-                  vit_1[k] = inco(fac4,k);
-                  vit_1_1[k] = inco(num1_1,k);
-                  flux[k] = -conv_centre_(psc,vit_0_0[k],vit_0[k],vit_1[k],vit_1_1[k],g1,g2,g3,g4);
+                  const double vit_0_0 = inco(num0_0,k);
+                  const double vit_0 = inco(fac3,k);
+                  const double vit_1 = inco(fac4,k);
+                  const double vit_1_1 = inco(num1_1,k);
+                  flux[k] = -conv_centre_(psc,vit_0_0,vit_0,vit_1,vit_1_1,g1,g2,g3,g4);
                 }
             }
         }
@@ -178,14 +142,13 @@ Eval_Conv_VDF_Face<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab
                 for (int k = 0; k < ncomp; k++) flux[k] = -psc*inco(fac3,k); // Schema amont
               else // Schema quick
                 {
-                  Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1(ncomp);
                   const double dx = dist_face_period_(fac3,fac4,ori), dm = dim_face_(fac3,ori), dxam = dist_face_period_(num0_0,fac3,ori);
                   for (int k = 0; k < ncomp; k++)
                     {
-                      vit_0[k] = inco(fac3,k);
-                      vit_1[k] = inco(fac4,k);
-                      vit_0_0[k] = inco(num0_0,k);
-                      flux[k] = -conv_quick_sharp_plus_(psc,vit_0[k],vit_1[k],vit_0_0[k],dx,dm,dxam);
+                      const double vit_0 = inco(fac3,k);
+                      const double vit_1 = inco(fac4,k);
+                      const double vit_0_0 = inco(num0_0,k);
+                      flux[k] = -conv_quick_sharp_plus_(psc,vit_0,vit_1,vit_0_0,dx,dm,dxam);
                     }
                 }
             }
@@ -195,14 +158,13 @@ Eval_Conv_VDF_Face<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab
                 for (int k = 0; k < ncomp; k++) flux[k] = -psc*inco(fac4,k); // Schema amont
               else // Schema quick
                 {
-                  Type_Double vit_0(ncomp), vit_1(ncomp), vit_1_1(ncomp);
                   const double dx = dist_face_period_(fac3,fac4,ori), dm = dim_face_(fac4,ori), dxam = dist_face_period_(fac4,num1_1,ori);
                   for (int k = 0; k < ncomp; k++)
                     {
-                      vit_0[k] = inco(fac3,k);
-                      vit_1[k] = inco(fac4,k);
-                      vit_1_1[k] = inco(num1_1,k);
-                      flux[k] = -conv_quick_sharp_moins_(psc,vit_0[k],vit_1[k],vit_1_1[k],dx,dm,dxam);
+                      const double vit_0 = inco(fac3,k);
+                      const double vit_1 = inco(fac4,k);
+                      const double vit_1_1 = inco(num1_1,k);
+                      flux[k] = -conv_quick_sharp_moins_(psc,vit_0,vit_1,vit_1_1,dx,dm,dxam);
                     }
                 }
             }
@@ -210,225 +172,187 @@ Eval_Conv_VDF_Face<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab
     }
 }
 
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void>
-Eval_Conv_VDF_Face<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab* a_r, int fac1, int fac2,int fac3, int fac4, Type_Double& flux) const
+// ===== _comp scalar variants (one component k) for MDRangePolicy kernels =====
+// INTERNE arete: stores in 'flux' the convective flux of component k for the arete (fac1, fac2, fac3, fac4); the formula is selected by the DERIVED_T scheme flags (IS_AMONT, IS_CENTRE, IS_CENTRE4, otherwise quick).
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type>
+KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::INTERNE, void>
+Eval_Conv_VDF_Face<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView a_r, int fac1, int fac2, int fac3, int fac4, int k, double& flux) const
 {
-  double psc = 0.25*((dt_vitesse(fac1)*porosite(fac1)+dt_vitesse(fac2)*porosite(fac2))*(surface(fac1)+surface(fac2)));
-  if (DERIVED_T::IS_CENTRE)
-    for (int k = 0; k < flux.size_array(); k++) flux[k] = -psc*0.5*(inco(fac3,k)+inco(fac4,k));
+  double psc = 0.25*((dt_vitesse<DeviceSpace>(fac1)*porosite<DeviceSpace>(fac1)+dt_vitesse<DeviceSpace>(fac2)*porosite<DeviceSpace>(fac2))*(surface<DeviceSpace>(fac1)+surface<DeviceSpace>(fac2))); // weight built with the single-argument dt_vitesse overload; kept as-is by every non-amont branch
+  if (DERIVED_T::IS_AMONT)
+    {
+      psc = 0.25*((dt_vitesse<DeviceSpace>(fac1,k)*porosite<DeviceSpace>(fac1)+dt_vitesse<DeviceSpace>(fac2,k)*porosite<DeviceSpace>(fac2))*(surface<DeviceSpace>(fac1)+surface<DeviceSpace>(fac2))); // amont overwrites psc with the component-k overload
+      const int f = psc > 0 ? fac3 : fac4; // face whose inco value is transported: fac3 when psc > 0, fac4 otherwise
+      if (a_r.size()>0) // NOTE(review): emptiness of a_r is tested with size() here but with data() in the FLUIDE and SORTIE_LIBRE variants - confirm both agree for an unset view
+        {
+          const int elem = elem_<DeviceSpace>(f,0), elem2 = elem_<DeviceSpace>(f,1);
+          const int e = dt_vitesse<DeviceSpace>(f,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); // neighbour chosen from the sign of dt_vitesse(f,k); falls back to the other one when the first choice is -1
+          psc *= a_r(e,k);
+        }
+      flux = -psc*inco(f,k);
+    }
+  else if (DERIVED_T::IS_CENTRE)
+    flux = -0.5*(inco(fac3,k)+inco(fac4,k))*psc; // two-point centred average of fac3 and fac4
   else
     {
-      for (int k = 0; k < flux.size_array(); k++)
+      const int ori = orientation<DeviceSpace>(fac1);
+      const int num0_0 = face_amont_conj_<DeviceSpace>(fac3,ori,0), num1_1 = face_amont_conj_<DeviceSpace>(fac4,ori,1); // extra faces beyond fac3 and fac4 used by the wider stencils; -1 when absent
+      if (DERIVED_T::IS_CENTRE4)
+        {
+          if ((num0_0==-1)||(num1_1==-1)) { flux = -0.5*(inco(fac3,k)+inco(fac4,k))*psc; return; } // a stencil face is missing: fall back to the two-point centred average
+          const double dx = dist_face_<DeviceSpace>(fac3,fac4,ori), dxam = dist_face_period_<DeviceSpace>(num0_0,fac3,ori), dxav = dist_face_period_<DeviceSpace>(fac4,num1_1,ori);
+          double g1,g2,g3,g4;
+          calcul_g_(dxam,dx,dxav,g1,g2,g3,g4); // fills the four weights g1..g4 from the three spacings
+          flux = -conv_centre_(psc,inco(num0_0,k),inco(fac3,k),inco(fac4,k),inco(num1_1,k),g1,g2,g3,g4);
+        }
+      else // IS_QUICK
         {
-          psc = 0.25 * ((dt_vitesse(fac1, k) * porosite(fac1) + dt_vitesse(fac2, k) * porosite(fac2)) * (surface(fac1) + surface(fac2)));
           if (psc > 0)
             {
-              const int elem = elem_(fac3, 0) > 0 ? elem_(fac3, 0) : elem_(fac3, 1);
-              if (a_r) psc *= (*a_r)(elem,k);
-              flux[k] = -psc * inco(fac3, k);
+              if (num0_0==-1) { flux = -psc*inco(fac3,k); return; } // no num0_0 face: plain upwind value taken on fac3
+              const double dx = dist_face_period_<DeviceSpace>(fac3,fac4,ori), dm = dim_face_<DeviceSpace>(fac3,ori), dxam = dist_face_period_<DeviceSpace>(num0_0,fac3,ori);
+              flux = -conv_quick_sharp_plus_(psc,inco(fac3,k),inco(fac4,k),inco(num0_0,k),dx,dm,dxam);
             }
           else
             {
-              const int elem = elem_(fac4, 0) > 0 ? elem_(fac4, 0) : elem_(fac4, 1);
-              if (a_r) psc *= (*a_r)(elem,k);
-              flux[k] = -psc * inco(fac4, k);
+              if (num1_1==-1) { flux = -psc*inco(fac4,k); return; } // no num1_1 face: plain upwind value taken on fac4
+              const double dx = dist_face_period_<DeviceSpace>(fac3,fac4,ori), dm = dim_face_<DeviceSpace>(fac4,ori), dxam = dist_face_period_<DeviceSpace>(fac4,num1_1,ori);
+              flux = -conv_quick_sharp_moins_(psc,inco(fac3,k),inco(fac4,k),inco(num1_1,k),dx,dm,dxam);
             }
         }
     }
 }
 
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
-inline std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-Eval_Conv_VDF_Face<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab* a_r, int fac1, int fac2, int fac3, int signe, Type_Double& flux3, Type_Double& flux1_2) const
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // MIXTE arete: stores in 'flux' the flux of component k; centred average of fac3/fac4 when IS_CENTRE, otherwise upwind (fac3 if psc > 0, else fac4)
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::MIXTE, void>
+Eval_Conv_VDF_Face<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView a_r, int fac1, int fac2, int fac3, int fac4, int k, double& flux) const
 {
-  assert(flux3.size_array() == flux1_2.size_array());
-  constexpr bool is_SYM = (Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE);
-  if (DERIVED_T::IS_AXI && is_SYM) return;
-  const int ncomp = flux3.size_array();
-
-  const int pfb = premiere_face_bord(), ori = orientation(fac3), rang1 = DERIVED_T::IS_QUICK ? fac1 : (fac1-pfb), rang2 = DERIVED_T::IS_QUICK ? fac2 :(fac2-pfb); // TODO : FIXME : euh ? pourquoi ca ?
-
-  for (int k = 0; k < ncomp; k++)
+  double psc = 0.25*((dt_vitesse<DeviceSpace>(fac1)*porosite<DeviceSpace>(fac1)+dt_vitesse<DeviceSpace>(fac2)*porosite<DeviceSpace>(fac2))*(surface<DeviceSpace>(fac1)+surface<DeviceSpace>(fac2))); // weight from the single-argument dt_vitesse overload, only used by the centred branch
+  if (DERIVED_T::IS_CENTRE) { flux = -psc*0.5*(inco(fac3,k)+inco(fac4,k)); return; }
+  psc = 0.25*((dt_vitesse<DeviceSpace>(fac1,k)*porosite<DeviceSpace>(fac1)+dt_vitesse<DeviceSpace>(fac2,k)*porosite<DeviceSpace>(fac2))*(surface<DeviceSpace>(fac1)+surface<DeviceSpace>(fac2))); // every other scheme recomputes psc with the component-k overload
+  if (psc > 0)
     {
-      double psc = 0.25*((dt_vitesse(fac1,k)*porosite(fac1)+dt_vitesse(fac2,k)*porosite(fac2))*(surface(fac1)+surface(fac2)));
-      if ((psc*signe)>0)
-        {
-          const int elem = elem_(fac3, 0), elem2 = elem_(fac3, 1);
-          const int e = dt_vitesse(fac3, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem);
-          const double aa_r = (a_r && DERIVED_T::IS_AMONT) ? (*a_r)(e, k) : 1.0;
-          flux3[k] = -aa_r*inco(fac3,k)*psc ;
-        }
-      else
-        {
-          const int ind = ncomp*ori+k;
-          const double vf1 = Champ_Face_get_val_imp_face_bord_sym(inco,inconnue->temps(),rang1,ind,la_zcl());
-          const double vf2 = Champ_Face_get_val_imp_face_bord_sym(inco,inconnue->temps(),rang2,ind,la_zcl());
-          const int elem = elem_(fac3, 0), elem2 = elem_(fac3, 1);
-          const int e = dt_vitesse(fac3, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem);
-          const double aa_r = (a_r && DERIVED_T::IS_AMONT) ? (*a_r)(e, k) : 1.0;
-          flux3[k] = -aa_r * 0.5 * (vf1 + vf2) * psc ;
-        }
+      const int elem = elem_<DeviceSpace>(fac3,0) > -1 ? elem_<DeviceSpace>(fac3,0) : elem_<DeviceSpace>(fac3,1); // element 0 is a valid index: test against -1 like the sibling variants, so a_r is never read at row -1
+      if (a_r.size()>0) psc *= a_r(elem,k); // a_r scaling is applied only when the view is non-empty
+      flux = -psc*inco(fac3,k);
     }
-
-  for (int k = 0; k < ncomp; k++)
+  else
     {
-      double psc = 0.5*dt_vitesse(fac3,k)*surface(fac3)*porosite(fac3);
-      if (psc>0)
-        {
-          const int elem = elem_(fac1, 0), elem2 = elem_(fac1, 1);
-          const int e = dt_vitesse(fac1, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem);
-          const double aa_r = (a_r && DERIVED_T::IS_AMONT) ? (*a_r)(e, k) : 1.0;
-          flux1_2[k] = -aa_r * psc * inco(fac1, k);
-        }
-      else
-        {
-          const int elem = elem_(fac2, 0), elem2 = elem_(fac2, 1);
-          const int e = dt_vitesse(fac2, k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem);
-          const double aa_r = (a_r && DERIVED_T::IS_AMONT) ? (*a_r)(e, k) : 1.0;
-          flux1_2[k] = (DERIVED_T::IS_CENTRE || DERIVED_T::IS_CENTRE4) ? -psc*0.5*(inco(fac1,k)+inco(fac2,k)) : -aa_r * psc * inco(fac2, k);
-        }
+      const int elem = elem_<DeviceSpace>(fac4,0) > -1 ? elem_<DeviceSpace>(fac4,0) : elem_<DeviceSpace>(fac4,1); // same -1 test for the fac4 side
+      if (a_r.size()>0) psc *= a_r(elem,k);
+      flux = -psc*inco(fac4,k);
     }
 }
 
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PERIODICITE, void>
-Eval_Conv_VDF_Face<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab* a_r, int fac1, int fac2 , int fac3, int fac4, Type_Double& flux3_4, Type_Double& flux1_2) const
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // FLUIDE / NAVIER_FLUIDE / PAROI_FLUIDE arete, component k: writes two fluxes, flux3 (value taken on fac3 or from the imposed boundary values) and flux1_2 (value taken on fac1 or fac2)
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
+Eval_Conv_VDF_Face<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView val_imp_face_bord, CDoubleTabView, CDoubleTabView a_r, int fac1, int fac2, int fac3, int signe, int ncomp, int k, double& flux3, double& flux1_2) const
 {
-  assert(flux3_4.size_array() == flux1_2.size_array());
-  if (DERIVED_T::IS_QUICK) // XXX : LOL
+  constexpr bool is_SYM = (Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE);
+  if (DERIVED_T::IS_AXI && is_SYM) { flux3 = 0.; flux1_2 = 0.; return; } // axi + NAVIER_FLUIDE: both outputs are set to zero and nothing else is evaluated
+  const int ori = orientation<DeviceSpace>(fac3);
+  double psc = 0.25*((dt_vitesse<DeviceSpace>(fac1,k)*porosite<DeviceSpace>(fac1)+dt_vitesse<DeviceSpace>(fac2,k)*porosite<DeviceSpace>(fac2))*(surface<DeviceSpace>(fac1)+surface<DeviceSpace>(fac2)));
+  if ((psc*signe) > 0) // sign of psc*signe selects between the value on fac3 and the imposed boundary values
     {
-      if (DERIVED_T::IS_AXI) return;
-      else
-        {
-          flux_arete < Type_Flux_Arete::INTERNE > (inco, a_r, fac1, fac2, fac3, fac4, flux3_4);
-          flux_arete < Type_Flux_Arete::INTERNE > (inco, a_r, fac3, fac4, fac1, fac2, flux1_2);
-          return;
-        }
+      const int elem = elem_<DeviceSpace>(fac3,0), elem2 = elem_<DeviceSpace>(fac3,1);
+      const int e = dt_vitesse<DeviceSpace>(fac3,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem); // neighbour chosen from the sign of dt_vitesse(fac3,k), with fallback when it is -1
+      const double aa_r = (a_r.data() && DERIVED_T::IS_AMONT) ? a_r(e,k) : 1.0; // a_r scaling only for the amont scheme and only when the view holds data
+      flux3 = -aa_r*inco(fac3,k)*psc;
     }
-  const int ncomp = flux3_4.size_array();
-
-  // FIXME : pb_multi !
-  if (ncomp > 1) throw;
+  else
+    {
+      const int ind = ncomp*ori+k; // column of val_imp_face_bord: orientation-major, component-minor
+      const int elem = elem_<DeviceSpace>(fac3,0), elem2 = elem_<DeviceSpace>(fac3,1);
+      const int e = dt_vitesse<DeviceSpace>(fac3,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem);
+      const double aa_r = (a_r.data() && DERIVED_T::IS_AMONT) ? a_r(e,k) : 1.0;
+      flux3 = -aa_r*0.5*(val_imp_face_bord(fac1,ind)+val_imp_face_bord(fac2,ind))*psc; // NOTE(review): rows are addressed with fac1/fac2 exactly as passed in - confirm the view is laid out by that numbering (and not by boundary-face rank)
+    }
+  psc = 0.5*dt_vitesse<DeviceSpace>(fac3,k)*surface<DeviceSpace>(fac3)*porosite<DeviceSpace>(fac3); // second flux: weight built from fac3 alone
+  if (psc > 0)
+    {
+      const int elem = elem_<DeviceSpace>(fac1,0), elem2 = elem_<DeviceSpace>(fac1,1);
+      const int e = dt_vitesse<DeviceSpace>(fac1,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem);
+      const double aa_r = (a_r.data() && DERIVED_T::IS_AMONT) ? a_r(e,k) : 1.0;
+      flux1_2 = -aa_r*psc*inco(fac1,k);
+    }
+  else
+    {
+      const int elem = elem_<DeviceSpace>(fac2,0), elem2 = elem_<DeviceSpace>(fac2,1);
+      const int e = dt_vitesse<DeviceSpace>(fac2,k) > 0 ? (elem > -1 ? elem : elem2) : (elem2 > -1 ? elem2 : elem);
+      const double aa_r = (a_r.data() && DERIVED_T::IS_AMONT) ? a_r(e,k) : 1.0;
+      flux1_2 = (DERIVED_T::IS_CENTRE || DERIVED_T::IS_CENTRE4) ? -psc*0.5*(inco(fac1,k)+inco(fac2,k)) : -aa_r*psc*inco(fac2,k); // centred schemes average fac1 and fac2 here; the others take the fac2 value
+    }
+}
 
-  double psc = 0.25*(dt_vitesse(fac1)*porosite(fac1)+dt_vitesse(fac2)*porosite(fac2))*(surface(fac1) +surface(fac2));
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // PERIODICITE arete, component k: writes flux3_4 (built from fac3/fac4 values) and flux1_2 (built from fac1/fac2 values)
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::PERIODICITE, void>
+Eval_Conv_VDF_Face<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView a_r, int fac1, int fac2, int fac3, int fac4, int k, double& flux3_4, double& flux1_2) const
+{
+  if (DERIVED_T::IS_QUICK) // quick: the arete is evaluated as two INTERNE aretes, the second call with the two face pairs swapped
+    {
+      if (DERIVED_T::IS_AXI) { flux3_4 = 0.; flux1_2 = 0.; return; } // axi: both outputs are zeroed and nothing is evaluated
+      flux_arete_comp<Type_Flux_Arete::INTERNE>(inco, a_r, fac1, fac2, fac3, fac4, k, flux3_4);
+      flux_arete_comp<Type_Flux_Arete::INTERNE>(inco, a_r, fac3, fac4, fac1, fac2, k, flux1_2);
+      return;
+    }
+  double psc = 0.25*(dt_vitesse<DeviceSpace>(fac1)*porosite<DeviceSpace>(fac1)+dt_vitesse<DeviceSpace>(fac2)*porosite<DeviceSpace>(fac2))*(surface<DeviceSpace>(fac1)+surface<DeviceSpace>(fac2)); // NOTE(review): the single-argument dt_vitesse is used for every k below - confirm this is intended when there is more than one component
   if (DERIVED_T::IS_CENTRE)
-    for (int k = 0; k < ncomp; k++) flux3_4[k] = -psc*0.5*(inco(fac3,k)+inco(fac4,k));
+    flux3_4 = -psc*0.5*(inco(fac3,k)+inco(fac4,k));
   else if (DERIVED_T::IS_CENTRE4)
     {
-      const int ori = orientation(fac1), num0_0 = face_amont_conj_(fac3,ori,0),num1_1 = face_amont_conj_(fac4,ori,1);
-      if ( (num0_0 == -1)||(num1_1== -1) )
-        for (int k = 0; k < ncomp; k++) flux3_4[k] = -psc*0.5*(inco(fac3,k)+inco(fac4,k)); // Schema centre 2 (pas assez de faces)
-      else // Schema Centre4
+      const int ori = orientation<DeviceSpace>(fac1), num0_0 = face_amont_conj_<DeviceSpace>(fac3,ori,0), num1_1 = face_amont_conj_<DeviceSpace>(fac4,ori,1);
+      if ((num0_0==-1)||(num1_1==-1)) // a stencil face is missing: two-point centred average instead of the four-point formula
+        flux3_4 = -psc*0.5*(inco(fac3,k)+inco(fac4,k));
+      else
         {
-          Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1_1(ncomp), vit_1(ncomp);
-          const double dx = dist_face_period_(fac3,fac4,ori), dxam = dist_face_period_(num0_0,fac3,ori), dxav = dist_face_period_(fac4,num1_1,ori);
-          double g1, g2, g3, g4;
+          const double dx = dist_face_period_<DeviceSpace>(fac3,fac4,ori), dxam = dist_face_period_<DeviceSpace>(num0_0,fac3,ori), dxav = dist_face_period_<DeviceSpace>(fac4,num1_1,ori);
+          double g1,g2,g3,g4;
           calcul_g_(dxam,dx,dxav,g1,g2,g3,g4);
-          for (int k = 0; k < ncomp; k++)
-            {
-              vit_0_0[k] = inco(num0_0,k);
-              vit_0[k] = inco(fac3,k);
-              vit_1[k] = inco(fac4,k);
-              vit_1_1[k] = inco(num1_1,k);
-              flux3_4[k] = -conv_centre_(psc,vit_0_0[k],vit_0[k],vit_1[k],vit_1_1[k],g1,g2,g3,g4);
-            }
+          flux3_4 = -conv_centre_(psc,inco(num0_0,k),inco(fac3,k),inco(fac4,k),inco(num1_1,k),g1,g2,g3,g4);
         }
     }
   else
-    {
-      if (psc>0)
-        for (int k = 0; k < ncomp; k++)
-          {
-//            if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)(elem_(fac3,0),0); // FIXME
-            flux3_4[k] = -psc*inco(fac3,k);
-          }
-      else for (int k = 0; k < ncomp; k++)
-          {
-//            if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)(elem_(fac4,0),0); // FIXME
-            flux3_4[k] = -psc*inco(fac4,k);
-          }
-    }
+    flux3_4 = psc > 0 ? -psc*inco(fac3,k) : -psc*inco(fac4,k); // upwind on the sign of psc; a_r is not applied on this path
 
-  psc = 0.25*(dt_vitesse(fac3)*porosite(fac3)+dt_vitesse(fac4)*porosite(fac4))*(surface(fac3)+surface(fac4));
+  psc = 0.25*(dt_vitesse<DeviceSpace>(fac3)*porosite<DeviceSpace>(fac3)+dt_vitesse<DeviceSpace>(fac4)*porosite<DeviceSpace>(fac4))*(surface<DeviceSpace>(fac3)+surface<DeviceSpace>(fac4)); // second flux: same three-way scheme choice with the roles of (fac1, fac2) and (fac3, fac4) exchanged
   if (DERIVED_T::IS_CENTRE)
-    for (int k = 0; k < ncomp; k++) flux1_2[k] = -psc*0.5*(inco(fac1,k)+inco(fac2,k));
+    flux1_2 = -psc*0.5*(inco(fac1,k)+inco(fac2,k));
   else if (DERIVED_T::IS_CENTRE4)
     {
-      const int ori = orientation(fac3), num0_0 = face_amont_conj_(fac1,ori,0), num1_1 = face_amont_conj_(fac2,ori,1);
-
-      if ( (num0_0 == -1)||(num1_1== -1) )
-        for (int k=0; k<ncomp; k++)  flux1_2[k] = -psc*0.5*(inco(fac1,k)+inco(fac2,k)); // Schema centre 2 (pas assez de faces)
-      else // Schema Centre4
+      const int ori = orientation<DeviceSpace>(fac3), num0_0 = face_amont_conj_<DeviceSpace>(fac1,ori,0), num1_1 = face_amont_conj_<DeviceSpace>(fac2,ori,1);
+      if ((num0_0==-1)||(num1_1==-1)) // a stencil face is missing: two-point centred average
+        flux1_2 = -psc*0.5*(inco(fac1,k)+inco(fac2,k));
+      else
         {
-          Type_Double vit_0(ncomp), vit_0_0(ncomp), vit_1_1(ncomp), vit_1(ncomp);
-          const double dx = dist_face_period_(fac1,fac2,ori),dxam = dist_face_period_(num0_0,fac1,ori), dxav = dist_face_period_(fac2,num1_1,ori);
-          double g1, g2, g3, g4;
+          const double dx = dist_face_period_<DeviceSpace>(fac1,fac2,ori), dxam = dist_face_period_<DeviceSpace>(num0_0,fac1,ori), dxav = dist_face_period_<DeviceSpace>(fac2,num1_1,ori);
+          double g1,g2,g3,g4;
           calcul_g_(dxam,dx,dxav,g1,g2,g3,g4);
-          for (int k = 0; k < ncomp; k++)
-            {
-              vit_0_0[k] = inco(num0_0,k);
-              vit_0[k] = inco(fac1,k);
-              vit_1[k] = inco(fac2,k);
-              vit_1_1[k]=inco(num1_1,k);
-              flux1_2[k] = -conv_centre_(psc,vit_0_0[k],vit_0[k],vit_1[k],vit_1_1[k],g1,g2,g3,g4);
-            }
+          flux1_2 = -conv_centre_(psc,inco(num0_0,k),inco(fac1,k),inco(fac2,k),inco(num1_1,k),g1,g2,g3,g4);
         }
     }
   else
-    {
-      if (psc>0)
-        for (int k = 0; k < ncomp; k++)
-          {
-//            if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)(elem_(fac1,0),0); // FIXME
-            flux1_2[k] = -psc*inco(fac1,k);
-          }
-      else for (int k = 0; k < ncomp; k++)
-          {
-//            if (a_r && DERIVED_T::IS_AMONT) psc *= (*a_r)(elem_(fac2,0),0); // FIXME
-            flux1_2[k] = -psc*inco(fac2,k);
-          }
-    }
+    flux1_2 = psc > 0 ? -psc*inco(fac1,k) : -psc*inco(fac2,k); // upwind on the sign of psc; a_r is not applied on this path
 }
 
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
-Eval_Conv_VDF_Face<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab* a_r, int fac1, int , int fac3, int signe, Type_Double& flux3, Type_Double& flux1_2) const
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // COIN_FLUIDE arete, component k: writes flux3 and flux1_2; only the amont scheme is implemented, and the a_r view and second face argument are unnamed and unused
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
+Eval_Conv_VDF_Face<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView val_imp_face_bord, CDoubleTabView, CDoubleTabView, int fac1, int, int fac3, int signe, int k, double& flux3, double& flux1_2) const
 {
-  assert(flux3.size_array() == flux1_2.size_array());
-  if (!DERIVED_T::IS_AMONT)
-    {
-      Cerr << "Flux_arete with Type_Flux_Arete::COIN_FLUIDE is only coded for amont scheme !" <<finl;
-      Process::exit();
-    }
-
-  const int ncomp = flux3.size_array();
-
-  // FIXME : pb_multi !
-  if (ncomp > 1) throw;
-
-  double psc = 0.5 * dt_vitesse(fac1) * porosite(fac1) * surface(fac1);
-  if ((psc * signe) > 0)
-    for (int k = 0; k < ncomp; k++)
-      {
-//        if (a_r) psc *= (*a_r)(elem_(fac3,0),0); // FIXME
-        flux3[k] = -inco(fac3,k) * psc;
-      }
-  else
-    {
-      const int pfb = premiere_face_bord(), rang1 = (fac1 - pfb), ori = orientation(fac3);
-      for (int k = 0; k < ncomp; k++) flux3[k] = -Champ_Face_get_val_imp_face_bord(inconnue->temps(), rang1, ori, la_zcl()) * psc;
-    }
+  if (!DERIVED_T::IS_AMONT) { Process::Kokkos_exit("COIN_FLUIDE flux_arete_comp: only coded for amont"); flux3 = 0.; flux1_2 = 0.; return; } // non-amont schemes: report through Kokkos_exit, then leave both outputs at zero
+  double psc = 0.5*dt_vitesse<DeviceSpace>(fac1)*porosite<DeviceSpace>(fac1)*surface<DeviceSpace>(fac1); // weight built from fac1 alone, single-argument dt_vitesse overload
+  flux3 = ((psc*signe) > 0) ? -inco(fac3,k)*psc : -val_imp_face_bord(fac1, orientation<DeviceSpace>(fac3))*psc; // NOTE(review): the imposed value is read at (fac1, orientation(fac3)) and does not depend on k - confirm the row convention of val_imp_face_bord and that a single component is expected here
+  psc = 0.5*dt_vitesse<DeviceSpace>(fac3)*surface<DeviceSpace>(fac3)*porosite<DeviceSpace>(fac3); // second flux: weight built from fac3 alone
+  flux1_2 = psc > 0 ? -psc*inco(fac1,k) : -psc*val_imp_face_bord(fac3, orientation<DeviceSpace>(fac1)); // mirror of the flux3 rule with fac1 and fac3 exchanged (no 'signe' factor on this test)
+}
 
-  psc = 0.5 * dt_vitesse(fac3) * surface(fac3) * porosite(fac3);
-  if (psc > 0)
-    for (int k = 0; k < ncomp; k++)
-      {
-//        if (a_r) psc *= (*a_r)(elem_(fac1,0),0); // FIXME
-        flux1_2[k] = -psc * inco(fac1,k);
-      }
-  else
-    {
-      const int pfb = premiere_face_bord(), rang3 = (fac3 - pfb), ori = orientation(fac1);
-      for (int k = 0; k < ncomp; k++) flux1_2[k] = -psc * Champ_Face_get_val_imp_face_bord(inconnue->temps(), rang3, ori, la_zcl());
-    }
+template <typename DERIVED_T> template<Type_Flux_Fa7 Fa7_Type> // SORTIE_LIBRE fa7, component k: flux = -dt_vitesse(fac1,k)*surface(fac1)*inco(fac1,k)*porosite(fac1), additionally scaled by a_r for the amont scheme
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void>
+Eval_Conv_VDF_Face<DERIVED_T>::flux_fa7_comp(CDoubleTabView inco, CDoubleTabView a_r, int fac1, CDoubleTabView, int, int k, double& flux) const
+{
+  const int elem1 = elem_<DeviceSpace>(fac1,0), elem2 = elem_<DeviceSpace>(fac1,1);
+  double psc = dt_vitesse<DeviceSpace>(fac1,k)*surface<DeviceSpace>(fac1);
+  if (a_r.data() && DERIVED_T::IS_AMONT) psc *= a_r((elem1 != -1) ? elem1 : elem2, k); // a_r row: elem1 unless it is -1, then elem2; skipped when the view holds no data
+  flux = -psc*inco(fac1,k)*porosite<DeviceSpace>(fac1);
 }
 
 /* ************************************** *
@@ -507,7 +431,7 @@ Eval_Conv_VDF_Face<DERIVED_T>::coeffs_arete(const DoubleTab* a_r, int fac1, int
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double> inline
 std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE || Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
-Eval_Conv_VDF_Face<DERIVED_T>::coeffs_arete(const DoubleTab* a_r, int fac1, int fac2,int fac3,int signe,Type_Double& aii1_2, Type_Double& aii3_4, Type_Double& ajj1_2) const
+Eval_Conv_VDF_Face<DERIVED_T>::coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab* a_r, int fac1, int fac2,int fac3,int signe,Type_Double& aii1_2, Type_Double& aii3_4, Type_Double& ajj1_2) const
 {
   assert(aii1_2.size_array() == aii3_4.size_array() && aii1_2.size_array() == ajj1_2.size_array());
   if (DERIVED_T::IS_CENTRE || DERIVED_T::IS_AXI || DERIVED_T::IS_CENTRE4) return;
@@ -541,6 +465,7 @@ Eval_Conv_VDF_Face<DERIVED_T>::coeffs_arete(const DoubleTab* a_r, int fac1, int
     }
 }
 
+
 template <typename DERIVED_T> template <typename Type_Double>
 inline void Eval_Conv_VDF_Face<DERIVED_T>::fill_coeffs_proto(const int k, const double psc1, const double psc2, Type_Double& A, Type_Double& B) const
 {
diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face_leaves.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face_leaves.h
index 337041dc7e..5eff0ced62 100644
--- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face_leaves.h
+++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_Face_leaves.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -44,6 +44,17 @@ class Eval_Amont_VDF_Face : public Eval_Conv_VDF_Face<Eval_Amont_VDF_Face>, publ
 {
 public:
   static constexpr bool IS_AMONT = true, CALC_ARR_COIN_FL = true;
+
+  KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const // placeholder in the amont evaluator: no formula, every argument is ignored
+  { Process::Kokkos_exit("Error!"); return 0; } // reports "Error!" through Process::Kokkos_exit, then returns 0
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, // placeholder in the amont evaluator: no formula, every argument is ignored
+                                                       const double dx, const double dm,const double dxam) const
+  { Process::Kokkos_exit("Error!"); return 0; } // reports "Error!" through Process::Kokkos_exit, then returns 0
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0, // placeholder in the amont evaluator: no formula, every argument is ignored
+                                                      const double dx, const double dm, const double dxam) const
+  { Process::Kokkos_exit("Error!"); return 0; } // reports "Error!" through Process::Kokkos_exit, then returns 0
+  KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const // placeholder in the amont evaluator: g1..g4 are left untouched
+  { Process::Kokkos_exit("Error!"); } // only reports "Error!" through Process::Kokkos_exit
 };
 
 /*! @brief class Eval_Centre_VDF_Face Evaluateur VDF pour la convection Le champ convecte est un Champ_Face_VDF
@@ -55,6 +66,16 @@ class Eval_Centre_VDF_Face : public Eval_Conv_VDF_Face<Eval_Centre_VDF_Face>, pu
 {
 public:
   static constexpr bool IS_CENTRE = true;
+
+  KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const // placeholder in the centred evaluator: no formula, every argument is ignored
+  { Process::Kokkos_exit("Error!"); return 0; } // reports "Error!" through Process::Kokkos_exit, then returns 0
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, // placeholder in the centred evaluator: no formula, every argument is ignored
+                                                       const double dx, const double dm,const double dxam) const
+  { Process::Kokkos_exit("Error!"); return 0; } // reports "Error!" through Process::Kokkos_exit, then returns 0
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0,const double dx, const double dm, const double dxam) const // placeholder in the centred evaluator: no formula, every argument is ignored
+  { Process::Kokkos_exit("Error!"); return 0; } // reports "Error!" through Process::Kokkos_exit, then returns 0
+  KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const // placeholder in the centred evaluator: g1..g4 are left untouched
+  { Process::Kokkos_exit("Error!"); } // only reports "Error!" through Process::Kokkos_exit
 };
 
 /*! @brief class Eval_Centre4_VDF_Face Evaluateur VDF pour la convection Le champ convecte est un Champ_Face_VDF
@@ -67,16 +88,17 @@ class Eval_Centre4_VDF_Face : public Eval_Conv_VDF_Face<Eval_Centre4_VDF_Face>,
 public:
   static constexpr bool IS_CENTRE4 = true;
 
-  inline int face_amont_conj(int num_face,int i,int k) const override { return le_dom->face_amont_conj(num_face, i, k); }
-  inline int face_amont_princ(int num_face,int i) const override { return le_dom->face_amont_princ(num_face, i); }
   inline double dist_face(int n1,int n2,int k) const { return le_dom->dist_face(n1,n2,k); }
   inline double dist_face_period(int n1,int n2,int k) const { return le_dom->dist_face_period(n1,n2,k); }
-  inline double dist_elem_period(int n1,int n2,int k) const override { return le_dom->dist_elem_period(n1,n2,k); }
-  inline double dim_elem(int n1,int k) const override { return le_dom->dim_elem(n1,k); }
-  inline double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const override
+  KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const // four-point centred value: the four inputs weighted by g1..g4, summed, then multiplied by psc
   { return (g1*vit_0_0 + g2*vit_0 + g3*vit_1 + g4*vit1_1) * psc; }
-
-  inline void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const override
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1, // placeholder in the centre4 evaluator: no formula, every argument is ignored
+                                                       const double dx, const double dm,const double dxam) const
+  { Process::Kokkos_exit("Error!"); return 0; } // reports "Error!" through Process::Kokkos_exit, then returns 0
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0, // placeholder in the centre4 evaluator: no formula, every argument is ignored
+                                                      const double dx, const double dm, const double dxam) const
+  { Process::Kokkos_exit("Error!"); return 0; } // reports "Error!" through Process::Kokkos_exit, then returns 0
+  KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const // forwards unchanged to calcul_g_impl, which receives g1..g4 by reference
   {return calcul_g_impl(dxam,dx,dxav,g1,g2,g3,g4); }
 };
 
@@ -90,21 +112,21 @@ class Eval_Quick_VDF_Face : public Eval_Conv_VDF_Face<Eval_Quick_VDF_Face>, publ
 public:
   static constexpr bool IS_QUICK = true;
 
-  inline int face_amont_conj(int num_face,int i,int k) const override { return le_dom->face_amont_conj(num_face, i, k); }
-  inline int face_amont_princ(int num_face,int i) const override { return le_dom->face_amont_princ(num_face, i); }
-  inline double dim_elem(int n1,int k) const override { return le_dom->dim_elem(n1,k); }
-  inline double dim_face(int n1,int k) const override { return le_dom->dim_face(n1,k); }
   inline double dist_face(int n1,int n2,int k) const { return le_dom->dist_face(n1,n2,k); }
-  inline double dist_elem(int n1,int n2,int k) const override { return le_dom->dist_elem(n1,n2,k); }
-  inline double dist_elem_period(int n1, int n2, int k) const override { return le_dom->dist_elem_period(n1,n2,k); }
   inline double dist_face_period(int n1,int n2,int k) const { return le_dom->dist_face_period(n1,n2,k); }
-  inline double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0,
-                                      const double dx, const double dm, const double dxam) const override
+  KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const
+  { Process::Kokkos_exit("Eval_Quick_VDF_Face::conv_centre() must not be called!"); return 0; } // stub: reports the misuse through Process::Kokkos_exit; the return only satisfies the signature
+
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0,
+                                                      const double dx, const double dm, const double dxam) const // thin forwarder: same arguments, same order, to conv_quick_sharp_plus_impl
   { return conv_quick_sharp_plus_impl(psc,vit_0,vit_1,vit_0_0,dx,dm,dxam); }
 
-  inline double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1,
-                                       const double dx, const double dm,const double dxam) const override
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1,
+                                                       const double dx, const double dm,const double dxam) const // thin forwarder: same arguments, same order, to conv_quick_sharp_moins_impl
   { return conv_quick_sharp_moins_impl(psc,vit_0,vit_1,vit_1_1,dx,dm,dxam); }
+
+  KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const
+  { Process::Kokkos_exit("Eval_Quick_VDF_Face::calcul_g() must not be called!"); } // stub: reports the misuse through Process::Kokkos_exit; g1..g4 are left untouched
 };
 
 /*! @brief class Eval_Quick_VDF_Face_Axi Evaluateur VDF pour la convection en coordonnees cylindriques : Le champ convecte est un Champ_Face_VDF
@@ -117,26 +139,31 @@ class Eval_Quick_VDF_Face_Axi : public Eval_Conv_VDF_Face<Eval_Quick_VDF_Face_Ax
 public:
   static constexpr bool IS_AXI = true, IS_QUICK = true, CALC_ARR_PERIO = false, CALC_ARR_NAVIER_FL = false;
 
-  inline int face_amont_princ(int num_face,int i) const override { return le_dom->face_amont_princ(num_face, i); }
-  inline int face_amont_conj(int ,int ,int ) const override;
+  inline int face_amont_conj(int ,int ,int ) const;
   inline double dist_face(int ,int ,int ) const;
-  inline double dist_elem_period(int n1, int n2, int k) const override { return dist_face(n1,n2,k); }
-  inline double dim_face(int ,int ) const override;
-  inline double dist_elem(int ,int ,int ) const override;
-  inline double dim_elem(int ,int ) const override;
-  inline double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0,
-                                      const double dx, const double dm, const double dxam) const override
+  inline double dist_elem_period(int n1, int n2, int k) const { return dist_face(n1,n2,k); } // simply returns dist_face for the same arguments
+  inline double dim_face(int ,int ) const;
+  inline double dist_elem(int ,int ,int ) const;
+  inline double dim_elem(int ,int ) const;
+  KOKKOS_INLINE_FUNCTION double conv_centre(const double psc,const double vit_0_0, const double vit_0, const double vit_1,const double vit1_1,double g1, double g2, double g3,double g4) const
+  { Process::Kokkos_exit("Eval_Quick_VDF_Face_Axi::conv_centre() must not be called!"); return 0; } // stub: reports the misuse through Process::Kokkos_exit; the return only satisfies the signature
+
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_plus(const double psc,const double vit_0, const double vit_1, const double vit_0_0,
+                                                      const double dx, const double dm, const double dxam) const // thin forwarder: same arguments, same order, to conv_quick_sharp_plus_impl
   { return conv_quick_sharp_plus_impl(psc,vit_0,vit_1,vit_0_0,dx,dm,dxam); }
 
-  inline double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1,
-                                       const double dx, const double dm,const double dxam) const override
+  KOKKOS_INLINE_FUNCTION double conv_quick_sharp_moins(const double psc,const double vit_0,const double vit_1, const double vit_1_1,
+                                                       const double dx, const double dm,const double dxam) const // thin forwarder: same arguments, same order, to conv_quick_sharp_moins_impl
   { return conv_quick_sharp_moins_impl(psc,vit_0,vit_1,vit_1_1,dx,dm,dxam); }
+
+  KOKKOS_INLINE_FUNCTION void calcul_g(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const
+  { Process::Kokkos_exit("Eval_Quick_VDF_Face_Axi::calcul_g() must not be called!"); } // stub: reports the misuse through Process::Kokkos_exit; g1..g4 are left untouched
 };
 
 inline double Eval_Quick_VDF_Face_Axi::dim_elem(int n1, int k) const
 {
   const IntTab& elem_faces_ = le_dom->elem_faces();
-  return dist_face(elem_faces_(n1,k), elem_faces_(n1,k+dimension), k) ;
+  return dist_face(elem_faces_(n1,k), elem_faces_(n1,k+Objet_U::dimension), k) ; // dist_face between faces k and k+dimension of element n1, in direction k
 }
 
 inline double Eval_Quick_VDF_Face_Axi::dist_elem(int n1, int n2, int k) const
@@ -165,7 +192,7 @@ inline int Eval_Quick_VDF_Face_Axi::face_amont_conj(int num_face, int k, int i)
   const IntTab& face_voisins_ = le_dom->face_voisins();
   const IntTab& elem_faces_   = le_dom->elem_faces();
   const IntVect& orientation_ = le_dom->orientation();
-  return face_amont_conj_axi_impl(num_face,k,i,dimension,face_voisins_,elem_faces_,orientation_);
+  return face_amont_conj_axi_impl(num_face,k,i,Objet_U::dimension,face_voisins_,elem_faces_,orientation_);
 }
 
 #endif /* Eval_Conv_VDF_Face_leaves_included */
diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.cpp b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.cpp
index d9912cb61e..8b2232cf5a 100644
--- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.cpp
+++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2022, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -15,44 +15,6 @@
 
 #include <Eval_Conv_VDF_tools.h>
 
-// quick pour un champ face
-double Eval_Conv_VDF_tools::conv_quick_sharp_plus_impl(const double psc,const double vit_0, const double vit_1,
-                                                       const double vit_0_0, const double dx,
-                                                       const double dm, const double dxam) const
-{
-  double cf, curv, delta_0 = vit_0 - vit_0_0, delta = vit_1 - vit_0, dd1,utc, delta_delta;
-  curv = (delta/dx - delta_0/dxam)/dm ;
-  // Calcul de cf:
-  delta_delta = delta_0+delta;
-  dd1 = std::fabs(delta_delta);
-  if (dd1 < 1.e-5) cf = 0.125;
-  else
-    {
-      utc = delta_0/delta_delta;
-      cf = sharp2(utc);
-    }
-  return (0.5*(vit_0 + vit_1) - cf*(dx*dx)*curv)*psc;
-}
-
-// quick pour un champ face
-double Eval_Conv_VDF_tools::conv_quick_sharp_moins_impl(const double psc,const double vit_0,const double vit_1,
-                                                        const double vit_1_1,const double dx,
-                                                        const double dm,const double dxam) const
-{
-  double cf, curv, delta_1 = vit_1_1 - vit_1, delta = vit_1 - vit_0, dd1,utc, delta_delta;
-  curv = ( delta_1/dxam - delta/dx )/dm ;
-  // Calcul de cf:
-  delta_delta = delta_1+delta;
-  dd1 = std::fabs(delta_delta);
-  if (dd1 < 1.e-5) cf = 0.125;
-  else
-    {
-      utc = delta_1/delta_delta;
-      cf = sharp2(utc);
-    }
-  return (0.5*(vit_0 + vit_1) - cf*(dx*dx)*curv)*psc;
-}
-
 int Eval_Conv_VDF_tools::face_amont_conj_axi_impl(int num_face, int k, int i, int dimension,
                                                   const IntTab& face_voisins, const IntTab& elem_faces,
                                                   const IntVect& orientation) const
@@ -108,12 +70,3 @@ double Eval_Conv_VDF_tools::dist_elem_axi_impl(int n1, int n2, int k, const Doub
     }
   return dist ;
 }
-
-// Calcul des coefficients g1,g2,g3,g4 a partir de dxam,dx,dxav
-void Eval_Conv_VDF_tools::calcul_g_impl(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const
-{
-  g1 = -dx*dx*(dx/2+dxav)/(4*(dx+dxam+dxav)*(dx+dxam)*dxam);
-  g2 =  (dx+2*dxam)*(dx+2*dxav)/(8*dxam*(dx+dxav));
-  g3 =  (dx+2*dxam)*(dx+2*dxav)/(8*dxav*(dx+dxam));
-  g4 = -dx*dx*(dx/2+dxam)/(4*(dx+dxam+dxav)*dxav*(dx+dxav));
-}
diff --git a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.h b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.h
index 127bca092e..ea3b153872 100644
--- a/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.h
+++ b/src/VDF/Operateurs/Eval_Conv/Eval_Conv_VDF_tools.h
@@ -23,48 +23,55 @@ class Eval_Conv_VDF_tools
 {
 public:
   virtual ~Eval_Conv_VDF_tools() {}
-  // DANGER !!!! FAUT JAMAIS ENTRER
-  virtual int amont_amont(int face, int i) const { return dont_call<int>(__func__); }
-  virtual int face_amont_conj(int ,int ,int ) const { return dont_call<int>(__func__); }
-  virtual int face_amont_princ(int ,int ) const { return dont_call<int>(__func__); }
-  virtual double dim_elem(int ,int ) const { return dont_call<double>(__func__); }
-  virtual double dim_face(int ,int ) const { return dont_call<double>(__func__); }
-  virtual double dist_elem(int ,int ,int ) const { return dont_call<double>(__func__); }
-  virtual double dist_elem_period(int , int , int ) const { return dont_call<double>(__func__); }
-  virtual double conv_centre(const double,const double,const double,const double,const double,double,double,double,double) const { return dont_call<double>(__func__); }
-  virtual double conv_quick_sharp_plus(const double,const double,const double,const double,const double,const double,const double) const { return dont_call<double>(__func__); }
-  virtual double conv_quick_sharp_moins(const double,const double,const double,const double,const double,const double,const double) const { return dont_call<double>(__func__); }
-  virtual void calcul_g(const double,const double,const double,double&,double&,double&,double&) const { return dont_call<void>(__func__); }
-
   template <typename Type_Double>
   void qcentre(const double, const int, const int, const int, const int, const int, const DoubleTab&, Type_Double& ) const { return dont_call<void>(__func__); }
+  KOKKOS_INLINE_FUNCTION // view-based counterpart of qcentre(); this version only routes to dont_call()
+  void qcentre_view(const double, const int, const int, const int, const int, const int, CDoubleTabView, DoubleArrView) const { return dont_call<void>(__func__); }
 
   template <typename Type_Double>
   void quick_fram(const Type_Double&, const int, const int, const int, const int, const int, const DoubleTab&, Type_Double& ) const { return dont_call<void>(__func__); }
+  KOKKOS_INLINE_FUNCTION // view-based counterpart of quick_fram(), taking a scalar first argument; this version only routes to dont_call()
+  void quick_fram_view(const double, const int, const int, const int, const int, const int, CDoubleTabView, DoubleArrView) const { return dont_call<void>(__func__); }
 
 protected:
   int face_amont_conj_axi_impl(int ,int ,int ,int , const IntTab& , const IntTab& , const IntVect&) const;
   double dist_face_axi_impl(int ,int ,int ,const DoubleTab&) const;
   double dist_elem_axi_impl(int ,int ,int ,const DoubleTab&) const;
-  double conv_quick_sharp_plus_impl(const double,const double,const double,const double,const double,const double,const double) const ;
-  double conv_quick_sharp_moins_impl(const double,const double,const double,const double,const double,const double,const double) const;
-  void calcul_g_impl(const double,const double,const double,double&,double&,double&,double& ) const ;
 
   template <typename Type_Double>
   void qcentre2_impl(const double,const int,const int,const int,const int,const int,const DoubleTab&,Type_Double&) const;
+  KOKKOS_INLINE_FUNCTION
+  void qcentre2_impl_view(const double,const int,const int,const int,const int,const int,CDoubleTabView,DoubleArrView) const;
+  KOKKOS_INLINE_FUNCTION
+  void qcentre2_impl_comp(const double,const int,const int,CDoubleTabView,const int,double&) const;
 
   template <typename Type_Double>
   void qcentre4_impl(const int,const double,const double,const double,const double,const int,const int,const int,const int,const int,const DoubleTab&,Type_Double&) const;
+  KOKKOS_INLINE_FUNCTION
+  void qcentre4_impl_view(const int,const double,const double,const double,const double,const int,const int,const int,const int,const int,CDoubleTabView,DoubleArrView) const;
+  KOKKOS_INLINE_FUNCTION
+  void qcentre4_impl_comp(const int,const double,const double,const double,const double,const int,const int,const int,const int,CDoubleTabView,const int,double&) const;
 
   template <typename Type_Double>
   void quick_fram_impl(const int,const double,const double,const double,const double,const double,const Type_Double&,const int,const int,const int,const int,const int,const DoubleTab&,Type_Double&) const;
+  KOKKOS_INLINE_FUNCTION
+  void quick_fram_impl_view(const int,const double,const double,const double,const double,const double,const double,const int,const int,const int,const int,const int,CDoubleTabView,DoubleArrView) const;
+  KOKKOS_INLINE_FUNCTION
+  void quick_fram_impl_comp(const int,const double,const double,const double,const double,const double,const double,const int,const int,const int,const int,CDoubleTabView,const int,double&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  double conv_quick_sharp_plus_impl(const double,const double,const double,const double,const double,const double,const double) const ;
+  KOKKOS_INLINE_FUNCTION
+  double conv_quick_sharp_moins_impl(const double,const double,const double,const double,const double,const double,const double) const;
+  KOKKOS_INLINE_FUNCTION
+  void calcul_g_impl(const double,const double,const double,double&,double&,double&,double& ) const ;
 
 private:
   template <typename type>
-  type dont_call (const char * nom_fct) const
+  KOKKOS_INLINE_FUNCTION type dont_call (const char * nom_fct) const // nom_fct is accepted but does not appear in the message below
   {
-    Cerr << "What ??? You should not call the function " << nom_fct << finl;
-    throw;
+    Process::Kokkos_exit("What ??? You should not call the function."); // fixed message handed to Process::Kokkos_exit
+    if constexpr (!std::is_void_v<type>) return type {}; // value-initialised result so that non-void instantiations still return something
   }
 };
 
@@ -80,25 +87,26 @@ inline double Fram4(const double s1,const double s2, const double s3,const doubl
 }
 
 // Fram pour QUICK
-inline double Fram(const double s1,const double s2, const double s3,const double s4)
+KOKKOS_INLINE_FUNCTION // Fram weight for QUICK: non-negative value capped at 1, computed from four values s1..s4
+double Fram(const double s1,const double s2, const double s3,const double s4)
 {
-  double smin0 = std::min(s4,s2), smax0 = std::max(s4,s2), smin1 = std::min(s3,s1), smax1 = std::max(s3,s1);
+  double smin0 = Kokkos::min(s4,s2), smax0 = Kokkos::max(s4,s2), smin1 = Kokkos::min(s3,s1), smax1 = Kokkos::max(s3,s1); // bounds of the pairs (s4,s2) and (s3,s1)
   // Ajout du DMINFLOAT car le compilateur Nvidia evalue quand meme (bug) si smax0-smin0=0...
-  double sr0 = (std::fabs(smax0-smin0)<DMINFLOAT ? 0. : (s3-smin0)/(smax0-smin0+DMINFLOAT));
-  double sr1 = (std::fabs(smax1-smin1)<DMINFLOAT ? 0. : (s2-smin1)/(smax1-smin1+DMINFLOAT));
-  double fr = 2.*std::max(std::fabs(sr0-0.5),std::fabs(sr1-0.5));
+  double sr0 = (Kokkos::fabs(smax0-smin0)<DMINFLOAT ? 0. : (s3-smin0)/(smax0-smin0+DMINFLOAT)); // s3 relative to [smin0,smax0]; 0 when that range is narrower than DMINFLOAT
+  double sr1 = (Kokkos::fabs(smax1-smin1)<DMINFLOAT ? 0. : (s2-smin1)/(smax1-smin1+DMINFLOAT)); // s2 relative to [smin1,smax1], same convention
+  double fr = 2.*Kokkos::max(Kokkos::fabs(sr0-0.5),Kokkos::fabs(sr1-0.5)); // raised to the 4th power by the two squarings below
   fr *= fr;
   fr *= fr;
-  return std::min(fr,1.0);
+  return Kokkos::min(fr,1.0); // never larger than 1
 }
 
 // Fonction de calcul de cf(limiteur de pente) dans le schema Quick-sharp
-inline double sharp2(const double utc)
+KOKKOS_INLINE_FUNCTION double sharp2(const double utc)
 {
   double cf;
   if ( (utc <= -1) || (utc >= 1.5) ) cf = 0.125;
   else if ((utc > -1) && (utc <= 0) ) cf = 0.5 + 0.375*utc;
-  else if ((utc <= 0.25) && (utc > 0) ) cf = 0.5 - 0.625*sqrt(utc);
+  else if ((utc <= 0.25) && (utc > 0) ) cf = 0.5 - 0.625*Kokkos::sqrt(utc);
   else if ((utc > 0.25) && (utc <= 1.) ) cf = 0.25*(1.-utc);
   else cf = 0.25*(utc-1.);
   return cf;
@@ -109,13 +117,7 @@ void Eval_Conv_VDF_tools::qcentre2_impl(const double psc, const int num0, const
                                         const DoubleTab& transporte,Type_Double& flux) const
 {
   int k, ncomp = flux.size_array();
-  Type_Double T0(ncomp), T1(ncomp);
-  for (k=0; k<ncomp; k++)
-    {
-      T0[k] = transporte(num0,k);
-      T1[k] = transporte(num1,k);
-    }
-  for (k=0; k<ncomp; k++) flux[k] =0.5 * (T0[k] + T1[k]) * psc ;
+  for (k=0; k<ncomp; k++) flux[k] = 0.5 * (transporte(num0,k) + transporte(num1,k)) * psc;
 }
 
 template <typename Type_Double>
@@ -123,17 +125,9 @@ void Eval_Conv_VDF_tools::qcentre4_impl(const int ori,const double dx, const dou
                                         const int num0_0, const int num1_1, const int face, const DoubleTab& transporte,Type_Double& flux) const
 {
   int k, ncomp = flux.size_array();
-  Type_Double T0(ncomp), T0_0(ncomp), T1(ncomp), T1_1(ncomp);
-  for (k=0; k<ncomp; k++)
-    {
-      T0[k] = transporte(num0,k);
-      T0_0[k] = transporte(num0_0,k);
-      T1[k] = transporte(num1,k);
-      T1_1[k] = transporte(num1_1,k);
-    }
   const double g1 = -dx*dx*(dx/2+dxav)/(4*(dx+dxam+dxav)*(dx+dxam)*dxam), g2 = (dx+2*dxam)*(dx+2*dxav)/(8*dxam*(dx+dxav));
   const double g3 = (dx+2*dxam)*(dx+2*dxav)/(8*dxav*(dx+dxam)), g4 = -dx*dx*(dx/2+dxam)/(4*(dx+dxam+dxav)*dxav*(dx+dxav));
-  for (k=0; k<ncomp; k++) flux[k] =( g1*T0_0[k] + g2*T0[k] + g3*T1[k] + g4*T1_1[k] ) * psc ;
+  for (k=0; k<ncomp; k++) flux[k] = (g1*transporte(num0_0,k) + g2*transporte(num0,k) + g3*transporte(num1,k) + g4*transporte(num1_1,k)) * psc;
 }
 
 template <typename Type_Double>
@@ -145,18 +139,100 @@ void Eval_Conv_VDF_tools::quick_fram_impl(const int ori,const double dx, const d
 
   for (int k=0; k<ncomp; k++)
     {
-      if ( (num0_0 == -1 && psc[k] >= 0 ) || (num1_1 == -1 && psc[k] <= 0 ) )
+      T0 = transporte(num0,k);
+      T0_0 = (num0_0!=-1?transporte(num0_0,k):0);
+      T1 = transporte(num1,k);
+      T1_1 = (num1_1!=-1?transporte(num1_1,k):0);
+
+      if (psc > 0)
         {
-          flux[k] = (psc[k] > 0) ? psc[k]*transporte(num0,k) : psc[k]*transporte(num1,k);
+          assert(num0_0!=-1);
+          trans_amont = T0;
+          curv = ( (T1 - T0)/dx - (T0 - T0_0)/dxam0 )/dm0 ;
         }
       else
         {
-          T0 = transporte(num0,k);
-          T0_0 = (num0_0!=-1?transporte(num0_0,k):0);
-          T1 = transporte(num1,k);
-          T1_1 = (num1_1!=-1?transporte(num1_1,k):0);
+          assert(num1_1!=-1);
+          trans_amont = T1;
+          curv = ( (T1_1 - T1)/dxam1 - (T1 - T0)/dx )/dm1;
+        }
+      flux[k] = 0.5*(T0+T1) - 0.125*(dx*dx)*curv;
+      // On applique le filtre Fram:
+      fr = ( num0_0 != -1 && num1_1 != -1 ) ? Fram(T0_0,T0,T1,T1_1) : 1.;
+      flux[k] = ((1.-fr)*flux[k] + fr*trans_amont)*psc;
+    }
+}
+
+// Views implementation:
+KOKKOS_INLINE_FUNCTION // centred 2-point flux on views: one entry of flux per component
+void Eval_Conv_VDF_tools::qcentre2_impl_view(const double psc, const int num0, const int num1, const int num0_0, const int num1_1, const int face,
+                                             CDoubleTabView transporte,DoubleArrView flux) const
+{
+  // num0_0, num1_1 and face are not read by this 2-point formula
+  const int nb_comp = (int)flux.size();
+  for (int comp = 0; comp < nb_comp; ++comp)
+    {
+      // half-sum of the values at num0 and num1, multiplied by psc
+      flux[comp] = 0.5 * (transporte(num0, comp) + transporte(num1, comp)) * psc;
+    }
+}
 
-          if (psc[k] > 0)
+KOKKOS_INLINE_FUNCTION // single-component form of the centred 2-point flux: writes the scalar flux for component k
+void Eval_Conv_VDF_tools::qcentre2_impl_comp(const double psc, const int num0, const int num1,
+                                             CDoubleTabView transporte, const int k, double& flux) const
+{
+  flux = 0.5 * (transporte(num0, k) + transporte(num1, k)) * psc; // half-sum of the values at num0 and num1, multiplied by psc
+}
+
+KOKKOS_INLINE_FUNCTION // centred 4-point flux on views: one entry of flux per component
+void Eval_Conv_VDF_tools::qcentre4_impl_view(const int ori,const double dx, const double dxam, const double dxav, const double psc, const int num0, const int num1,
+                                             const int num0_0, const int num1_1, const int face, CDoubleTabView transporte,DoubleArrView flux) const
+{
+  const double g1 = -dx*dx*(dx/2+dxav)/(4*(dx+dxam+dxav)*(dx+dxam)*dxam); // weights g1..g4: same expressions as calcul_g_impl
+  const double g2 = (dx+2*dxam)*(dx+2*dxav)/(8*dxam*(dx+dxav));
+  const double g3 = (dx+2*dxam)*(dx+2*dxav)/(8*dxav*(dx+dxam));
+  const double g4 = -dx*dx*(dx/2+dxam)/(4*(dx+dxam+dxav)*dxav*(dx+dxav));
+  const int nb_comp = (int)flux.size();
+  for (int comp = 0; comp < nb_comp; ++comp)
+    {
+      const double far_0 = transporte(num0_0,comp), near_0 = transporte(num0,comp);
+      const double near_1 = transporte(num1,comp), far_1 = transporte(num1_1,comp);
+      flux[comp] = ( g1*far_0 + g2*near_0 + g3*near_1 + g4*far_1 ) * psc ;
+    }
+}
+
+KOKKOS_INLINE_FUNCTION // single-component form of the centred 4-point flux: writes the scalar flux for component k
+void Eval_Conv_VDF_tools::qcentre4_impl_comp(const int ori, const double dx, const double dxam, const double dxav,
+                                             const double psc, const int num0, const int num1,
+                                             const int num0_0, const int num1_1,
+                                             CDoubleTabView transporte, const int k, double& flux) const
+{
+  const double g1 = -dx*dx*(dx/2+dxav)/(4*(dx+dxam+dxav)*(dx+dxam)*dxam); // weights g1..g4: same expressions as calcul_g_impl
+  const double g2 = (dx+2*dxam)*(dx+2*dxav)/(8*dxam*(dx+dxav));
+  const double g3 = (dx+2*dxam)*(dx+2*dxav)/(8*dxav*(dx+dxam));
+  const double g4 = -dx*dx*(dx/2+dxam)/(4*(dx+dxam+dxav)*dxav*(dx+dxav));
+  flux = (g1*transporte(num0_0,k) + g2*transporte(num0,k) + g3*transporte(num1,k) + g4*transporte(num1_1,k)) * psc; // weighted sum of the four values, multiplied by psc
+}
+
+KOKKOS_INLINE_FUNCTION
+void Eval_Conv_VDF_tools::quick_fram_impl_view(const int ori,const double dx, const double dm0, const double dxam0, const double dm1, const double dxam1, const double psc,
+                                               const int num0, const int num1, const int num0_0, const int num1_1, const int face, CDoubleTabView transporte, DoubleArrView flux) const
+{
+  const int ncomp = (int)flux.size();
+  for (int k = 0; k < ncomp; k++)
+    {
+      if ( (num0_0 == -1 && psc >= 0 ) || (num1_1 == -1 && psc <= 0 ) )
+        {
+          flux[k] = (psc > 0) ? psc*transporte(num0,k) : psc*transporte(num1,k);
+        }
+      else
+        {
+          double T0 = transporte(num0,k);
+          double T0_0 = (num0_0!=-1?transporte(num0_0,k):0);
+          double T1 = transporte(num1,k);
+          double T1_1 = (num1_1!=-1?transporte(num1_1,k):0);
+          double trans_amont, curv;
+          if (psc > 0)
             {
               trans_amont = T0;
               curv = ( (T1 - T0)/dx - (T0 - T0_0)/dxam0 )/dm0 ;
@@ -168,10 +244,88 @@ void Eval_Conv_VDF_tools::quick_fram_impl(const int ori,const double dx, const d
             }
           flux[k] = 0.5*(T0+T1) - 0.125*(dx*dx)*curv;
           // On applique le filtre Fram:
-          fr = ( num0_0 != -1 && num1_1 != -1 ) ? Fram(T0_0,T0,T1,T1_1) : 1.;
-          flux[k] = ((1.-fr)*flux[k] + fr*trans_amont)*psc[k];
+          double fr = ( num0_0 != -1 && num1_1 != -1 ) ? Fram(T0_0,T0,T1,T1_1) : 1.;
+          flux[k] = ((1.-fr)*flux[k] + fr*trans_amont)*psc;
         }
     }
 }
 
+KOKKOS_INLINE_FUNCTION // QUICK flux blended with the upstream value through the Fram weight, for the single component k
+void Eval_Conv_VDF_tools::quick_fram_impl_comp(const int ori, const double dx, const double dm0, const double dxam0,
+                                               const double dm1, const double dxam1, const double psc,
+                                               const int num0, const int num1, const int num0_0, const int num1_1,
+                                               CDoubleTabView transporte, const int k, double& flux) const
+{
+  const double T0 = transporte(num0, k), T1 = transporte(num1, k);
+  // Far neighbour missing (index -1) on the side selected by the sign of psc: same guard and same flux as quick_fram_impl_view
+  if ((num0_0 == -1 && psc >= 0) || (num1_1 == -1 && psc <= 0)) { flux = (psc > 0) ? psc*T0 : psc*T1; return; }
+  const double T0_0 = (num0_0 != -1 ? transporte(num0_0, k) : 0.);
+  const double T1_1 = (num1_1 != -1 ? transporte(num1_1, k) : 0.);
+  double trans_amont, curv;
+  if (psc > 0)
+    {
+      trans_amont = T0; // num0_0 != -1 is guaranteed here by the guard above
+      curv = ((T1 - T0)/dx - (T0 - T0_0)/dxam0) / dm0;
+    }
+  else
+    {
+      trans_amont = T1; // num1_1 != -1 is guaranteed here by the guard above
+      curv = ((T1_1 - T1)/dxam1 - (T1 - T0)/dx) / dm1;
+    }
+  const double val = 0.5*(T0+T1) - 0.125*(dx*dx)*curv;
+  // Fram weight: forced to 1 (flux = trans_amont*psc) as soon as one far neighbour is missing
+  const double fr = (num0_0 != -1 && num1_1 != -1) ? Fram(T0_0, T0, T1, T1_1) : 1.;
+  flux = ((1.-fr)*val + fr*trans_amont) * psc;
+}
+
+// QUICK-sharp flux for a face field, variant using the extra value vit_0_0 next to vit_0
+KOKKOS_INLINE_FUNCTION
+double Eval_Conv_VDF_tools::conv_quick_sharp_plus_impl(const double psc,const double vit_0, const double vit_1,
+                                                       const double vit_0_0, const double dx,
+                                                       const double dm, const double dxam) const
+{
+  // Two consecutive differences and the curvature term built from them
+  const double diff_far = vit_0 - vit_0_0;
+  const double diff_near = vit_1 - vit_0;
+  const double curvature = (diff_near/dx - diff_far/dxam)/dm ;
+  // Coefficient cf: 0.125 when |diff_far+diff_near| < 1.e-5, otherwise sharp2 of diff_far/(diff_far+diff_near)
+  const double diff_sum = diff_far+diff_near;
+  double coef = 0.125;
+  if (!(Kokkos::fabs(diff_sum) < 1.e-5))
+    {
+      coef = sharp2(diff_far/diff_sum);
+    }
+  return (0.5*(vit_0 + vit_1) - coef*(dx*dx)*curvature)*psc;
+}
+
+// QUICK-sharp flux for a face field, variant using the extra value vit_1_1 next to vit_1
+KOKKOS_INLINE_FUNCTION
+double Eval_Conv_VDF_tools::conv_quick_sharp_moins_impl(const double psc,const double vit_0,const double vit_1,
+                                                        const double vit_1_1,const double dx,
+                                                        const double dm,const double dxam) const
+{
+  // Two consecutive differences and the curvature term built from them
+  const double diff_far = vit_1_1 - vit_1;
+  const double diff_near = vit_1 - vit_0;
+  const double curvature = ( diff_far/dxam - diff_near/dx )/dm ;
+  // Coefficient cf: 0.125 when |diff_far+diff_near| < 1.e-5, otherwise sharp2 of diff_far/(diff_far+diff_near)
+  const double diff_sum = diff_far+diff_near;
+  double coef = 0.125;
+  if (!(Kokkos::fabs(diff_sum) < 1.e-5))
+    {
+      coef = sharp2(diff_far/diff_sum);
+    }
+  return (0.5*(vit_0 + vit_1) - coef*(dx*dx)*curvature)*psc;
+}
+
+// Computes the coefficients g1,g2,g3,g4 from dxam,dx,dxav
+KOKKOS_INLINE_FUNCTION
+void Eval_Conv_VDF_tools::calcul_g_impl(const double dxam, const double dx, const double dxav, double& g1, double& g2, double& g3, double& g4) const
+{
+  g1 = -dx*dx*(dx/2+dxav)/(4*(dx+dxam+dxav)*(dx+dxam)*dxam); // g1 and g4 mirror each other with dxam and dxav swapped
+  g2 =  (dx+2*dxam)*(dx+2*dxav)/(8*dxam*(dx+dxav)); // g2 and g3 share the numerator and swap dxam/dxav in the denominator
+  g3 =  (dx+2*dxam)*(dx+2*dxav)/(8*dxav*(dx+dxam));
+  g4 = -dx*dx*(dx/2+dxam)/(4*(dx+dxam+dxav)*dxav*(dx+dxav));
+}
+
 #endif /* Eval_Conv_VDF_tools_included */
diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF.h
index fb72ea0311..2e4179e54f 100644
--- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF.h
+++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF.h
@@ -24,6 +24,16 @@
 class Eval_Diff_VDF
 {
 public:
+  inline Eval_Diff_VDF() { }
+  inline Eval_Diff_VDF(const Eval_Diff_VDF& eval) // copies every data member of eval; the two DoubleTab members go through ref() instead of assignment
+  {
+    is_var_ = eval.is_var_;
+    ref_probleme_ = eval.ref_probleme_;
+    ref_diffusivite_ = eval.ref_diffusivite_;
+    tab_diffusivite_.ref(eval.tab_diffusivite_); // NOTE(review): ref() presumably attaches to eval's storage rather than duplicating it - confirm against the DoubleTab API
+    tab_alpha_.ref(eval.tab_alpha_);
+    tab_diffusivite_v_ = eval.tab_diffusivite_v_; // the view member is assigned as is
+  }
   virtual ~Eval_Diff_VDF() { }
 
   inline const int& is_var() const { return is_var_; }
@@ -71,43 +81,69 @@ class Eval_Diff_VDF
     update_diffusivite();
   }
 
+  // Template function by ExecSpace to get TRUSTTab tab_diffusivite_ (array on host) or view (array on device)
+  template<typename ExecSpace=HostSpace> // HostSpace reads tab_diffusivite_; any other ExecSpace reads tab_diffusivite_v_, which view_ro_impl() assigns
+  KOKKOS_INLINE_FUNCTION double tab_diffusivite(int face, int comp) const { if constexpr (std::is_same<ExecSpace, HostSpace>::value) return tab_diffusivite_(face, comp); else return tab_diffusivite_v_(face, comp); }
+
   // Methods used by the flux computation in template class:
-  inline double compute_heq_impl(double d0, int i, double d1, int j, int compo) const
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double compute_heq_impl(double d0, int i, double d1, int j, int compo) const // returns 1 / (d0/nu_i + d1/nu_j) for component compo
   {
-    return 1. / (d0 / tab_diffusivite_(is_var_ * i, compo) + d1 / tab_diffusivite_(is_var_ * j, compo));
+    return 1. / (d0 / tab_diffusivite<ExecSpace>(is_var_ * i, compo) + d1 / tab_diffusivite<ExecSpace>(is_var_ * j, compo)); // row index is_var_*i is always 0 when is_var_ == 0
   }
-
-  inline double nu_1_impl(int i, int compo) const { return tab_diffusivite_(is_var_ * i, compo); }
-  inline double nu_2_impl(int i, int compo) const { return tab_diffusivite_(is_var_ * i, compo); }
-
-  inline double nu_1_impl_face(int i, int j, int compo) const
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double nu_1_impl(int i, int compo) const // diffusivity read at row is_var_*i, component compo
   {
-    return 0.5 * (tab_diffusivite_(is_var_ * i, compo) + tab_diffusivite_(is_var_ * j, compo));
+    return tab_diffusivite<ExecSpace>(is_var_ * i, compo); // row index is always 0 when is_var_ == 0
   }
-
-  inline double nu_2_impl_face(int i, int j, int k, int l, int compo) const
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double nu_2_impl(int i, int compo) const // same body as nu_1_impl in this class
   {
-    return 0.25 * (tab_diffusivite_(is_var_ * i, compo) + tab_diffusivite_(is_var_ * j, compo) + tab_diffusivite_(is_var_ * k, compo) + tab_diffusivite_(is_var_ * l, compo));
+    return tab_diffusivite<ExecSpace>(is_var_ * i, compo); // row index is always 0 when is_var_ == 0
+  }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double nu_1_impl_face(int i, int j, int compo) const // mean of the diffusivity at rows is_var_*i and is_var_*j
+  {
+    return 0.5 * (tab_diffusivite<ExecSpace>(is_var_ * i, compo) + tab_diffusivite<ExecSpace>(is_var_ * j, compo)); // both reads hit row 0 when is_var_ == 0
+  }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double nu_2_impl_face(int i, int j, int k, int l, int compo) const // mean of the diffusivity at the four rows is_var_*{i,j,k,l}
+  {
+    return 0.25 * (tab_diffusivite<ExecSpace>(is_var_ * i, compo) + tab_diffusivite<ExecSpace>(is_var_ * j, compo) + tab_diffusivite<ExecSpace>(is_var_ * k, compo) + tab_diffusivite<ExecSpace>(is_var_ * l, compo)); // all four reads hit row 0 when is_var_ == 0
+  }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double nu_lam_impl_face(int i, int j, int k, int l, int compo) const // in this class: identical to nu_2_impl_face
+  {
+    return nu_2_impl_face<ExecSpace>(i, j, k, l, compo); // ExecSpace is forwarded unchanged
+  }
+  template<typename ExecSpace=HostSpace>
+  KOKKOS_INLINE_FUNCTION double nu_lam_impl_face2(int i, int j, int compo) const // in this class: identical to nu_1_impl_face
+  {
+    return nu_1_impl_face<ExecSpace>(i, j, compo); // ExecSpace is forwarded unchanged
   }
-
-  inline double nu_lam_impl_face(int i, int j, int k, int l, int compo) const { return nu_2_impl_face(i, j, k, l, compo); }
-  inline double nu_lam_impl_face2(int i, int j, int compo) const { return nu_1_impl_face(i, j, compo); }
 
   // These methods will be overloaded in DIFT operators (See Eval_Dift_VDF_const_Elem for example ...)
   inline int get_ind_Fluctu_Term() const { return 0; }
-  inline double get_dv_mvol(const int i) const { throw; } /* seulement pour K-Eps */
+  template<typename ExecSpace=HostSpace> // ExecSpace is not used by this body
+  KOKKOS_INLINE_FUNCTION double get_dv_mvol(const int i) const { Kokkos::abort("get_dv_mvol not implemented"); return 0.; } /* only for K-Eps */
   inline virtual double get_equivalent_distance(int boundary_index,int local_face) const { return 0; }
-  inline double nu_t_impl(int i, int compo) const { return 0.; }
-  inline double tau_tan_impl(int i, int j) const { return 0.; }
-  inline bool uses_wall() const { return false; }
-  inline bool uses_mod() const { return false; }
+  template<typename ExecSpace=HostSpace> // ExecSpace is not used by this body
+  KOKKOS_INLINE_FUNCTION double nu_t_impl(int i, int compo) const { return 0.; } // this class always returns 0.
+  KOKKOS_INLINE_FUNCTION double tau_tan_impl(int i, int j) const { return 0.; } // this class always returns 0.
+  KOKKOS_INLINE_FUNCTION bool uses_wall() const { return false; } // this class always answers false
+  KOKKOS_INLINE_FUNCTION bool uses_mod() const { return false; } // this class always answers false
   inline const DoubleTab& get_k_elem() const { throw; } // pour F5 seulement ...
 
+  virtual void view_ro_impl() const // const method: it can assign tab_diffusivite_v_ only because that member is declared mutable
+  {
+    tab_diffusivite_v_ = tab_diffusivite_.view_ro(); // stores the view that tab_diffusivite<ExecSpace>() reads for a non-HostSpace ExecSpace
+  }
 protected:
   int is_var_ = 0;
   OBS_PTR(Probleme_base) ref_probleme_;
   OBS_PTR(Champ_base) ref_diffusivite_;
   DoubleTab tab_diffusivite_, tab_alpha_;
+  mutable CDoubleTabView tab_diffusivite_v_;
 };
 
 #endif /* Eval_Diff_VDF_included */
diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.h
index 274b7a4794..c9ee812652 100644
--- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.h
+++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -63,19 +63,12 @@ class Eval_Diff_VDF_Elem_Gen : public Eval_VDF_Elem, public Evaluateur_VDF
    * *********  POUR L'EXPLICITE ********** *
    * ************************************** */
 
-  template <typename BC, typename Type_Double> // Generic return
-  inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const BC&, int, Type_Double& ) const { /* Do nothing */ }
-
   // To overload
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Dirichlet_entree_fluide&, const int, Type_Double& ) const;
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Scalaire_impose_paroi&, const int, Type_Double& ) const;
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Dirichlet_loi_paroi&, const int, Type_Double& ) const;
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Neumann_paroi&, const int, Type_Double& ) const;
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Periodique&, const int, Type_Double& ) const;
   template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Echange_global_impose&, const int, Type_Double& ) const;
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int, const Dirichlet_paroi_fixe&, const int, Type_Double& ) const;
   template <typename Type_Double> inline void flux_face(const DoubleTab&, const int, const int, const int, const Echange_externe_impose&, const int, Type_Double& ) const;
-  template <typename Type_Double> inline void flux_faces_interne(const DoubleTab&, const int, Type_Double& ) const;
+  template <typename BC> // BC: boundary-condition class; the definition selects its branch with if constexpr on BC
+  KOKKOS_INLINE_FUNCTION void flux_faces_bord_comp(CDoubleTabView, CDoubleTabView, const int, const BC_View&, const int, const int, double&) const; // args: inco, val_b, face, bc_view, num1, k, flux (output)
+  KOKKOS_INLINE_FUNCTION void flux_faces_interne_comp(CDoubleTabView, const int, const int, double&) const; // args: inco, face, k, flux (output)
 
   /* ************************************** *
    * *********  POUR L'IMPLICITE ********** *
@@ -103,13 +96,37 @@ class Eval_Diff_VDF_Elem_Gen : public Eval_VDF_Elem, public Evaluateur_VDF
   template <typename Type_Double> inline void secmem_face(const int, const Echange_global_impose&, const int, Type_Double& ) const;
   template <typename Type_Double> inline void secmem_face(const int, const int, const int, const Echange_externe_impose&, const int, Type_Double& ) const;
   template <typename Type_Double> inline void secmem_faces_interne(const int, Type_Double& ) const { /* Do nothing */ }
+  void view_ro() const override // calls Evaluateur_VDF::view_ro(), then DERIVED_T's view_ro_impl()
+  {
+    Evaluateur_VDF::view_ro();
+    static_cast<const DERIVED_T&>(*this).view_ro_impl(); // CRTP downcast to reach the derived class
+  }
 
 private:
-  inline double Dist_face_elem0(const int face, const int n0) const { return DERIVED_T::IS_AXI ? le_dom->dist_face_elem0_axi(face,n0) : le_dom->dist_face_elem0(face,n0); }
-  inline double Dist_face_elem1(const int face, const int n1) const { return DERIVED_T::IS_AXI ? le_dom->dist_face_elem1_axi(face,n1) : le_dom->dist_face_elem1(face,n1); }
-  inline double Dist_norm_bord (const int face) const
+  template<typename ExecSpace=HostSpace> // HostSpace reads through le_dom, any other ExecSpace through le_dom_v_
+  KOKKOS_INLINE_FUNCTION double get_dist_face_elem0(const int face, const int n0) const
+  {
+    if constexpr (!std::is_same_v<ExecSpace, HostSpace>)
+      return DERIVED_T::IS_AXI ? le_dom_v_.dist_face_elem0_axi(face,n0) : le_dom_v_.dist_face_elem0(face,n0);
+    else
+      return DERIVED_T::IS_AXI ? le_dom->dist_face_elem0_axi(face,n0) : le_dom->dist_face_elem0(face,n0);
+  }
+  template<typename ExecSpace=HostSpace> // HostSpace reads through le_dom, any other ExecSpace through le_dom_v_
+  KOKKOS_INLINE_FUNCTION double get_dist_face_elem1(const int face, const int n1) const
+  {
+    if constexpr (!std::is_same_v<ExecSpace, HostSpace>)
+      return DERIVED_T::IS_AXI ? le_dom_v_.dist_face_elem1_axi(face,n1) : le_dom_v_.dist_face_elem1(face,n1);
+    else
+      return DERIVED_T::IS_AXI ? le_dom->dist_face_elem1_axi(face,n1) : le_dom->dist_face_elem1(face,n1);
+  }
+  template<typename ExecSpace=HostSpace> // HostSpace reads through le_dom, any other ExecSpace through le_dom_v_
+  KOKKOS_INLINE_FUNCTION double get_dist_norm_bord(const int face) const
   {
-    double val = DERIVED_T::IS_AXI ? le_dom->dist_norm_bord_axi(face) : le_dom->dist_norm_bord(face);
+    double val;
+    if constexpr (!std::is_same_v<ExecSpace, HostSpace>)
+      val = DERIVED_T::IS_AXI ? le_dom_v_.dist_norm_bord_axi(face) : le_dom_v_.dist_norm_bord(face);
+    else
+      val = DERIVED_T::IS_AXI ? le_dom->dist_norm_bord_axi(face) : le_dom->dist_norm_bord(face);
     return DERIVED_T::IS_MULTD ? val : 2*val;
   }
 
@@ -133,11 +150,15 @@ class Eval_Diff_VDF_Elem_Gen : public Eval_VDF_Elem, public Evaluateur_VDF
 
   // CRTP pattern to static_cast the appropriate class and get the implementation : This is magic !
   inline int ind_Fluctu_Term() const { return static_cast<const DERIVED_T *>(this)->get_ind_Fluctu_Term(); } // See generic impl in the class Eval_Diff_VDF. They will be overloaded for Dift ops
-  inline double nu_1(const int i, int compo = 0) const { return static_cast<const DERIVED_T *>(this)->nu_1_impl(i,compo); }
-  inline double nu_2(const int i, int compo = 0) const { return static_cast<const DERIVED_T *>(this)->nu_2_impl(i,compo); }
-  inline double compute_heq(const double d0, const int i0, const double d1, const int i1, int compo = 0) const { return static_cast<const DERIVED_T *>(this)->compute_heq_impl(d0,i0,d1,i1,compo); }
+  template <typename ExecSpace=HostSpace> // ExecSpace is forwarded unchanged to DERIVED_T::nu_1_impl
+  KOKKOS_INLINE_FUNCTION double nu_1(const int i, int compo = 0) const { return static_cast<const DERIVED_T *>(this)->template nu_1_impl<ExecSpace>(i,compo); }
+  template <typename ExecSpace=HostSpace> // ExecSpace is forwarded unchanged to DERIVED_T::nu_2_impl
+  KOKKOS_INLINE_FUNCTION double nu_2(const int i, int compo = 0) const { return static_cast<const DERIVED_T *>(this)->template nu_2_impl<ExecSpace>(i,compo); }
+  template <typename ExecSpace=HostSpace> // ExecSpace is forwarded unchanged to DERIVED_T::compute_heq_impl
+  KOKKOS_INLINE_FUNCTION double compute_heq(const double d0, const int i0, const double d1, const int i1, int compo = 0) const { return static_cast<const DERIVED_T *>(this)->template compute_heq_impl<ExecSpace>(d0,i0,d1,i1,compo); }
   inline double equivalent_distance (const int boundary_index, const int local_face) const { return static_cast<const DERIVED_T *>(this)->get_equivalent_distance(boundary_index,local_face); }
-  inline double dv_mvol(const int i) const { return static_cast<const DERIVED_T *>(this)->get_dv_mvol(i); }
+  template <typename ExecSpace=HostSpace> // ExecSpace is forwarded unchanged to DERIVED_T::get_dv_mvol
+  KOKKOS_INLINE_FUNCTION double dv_mvol(const int i) const { return static_cast<const DERIVED_T *>(this)->template get_dv_mvol<ExecSpace>(i); }
 };
 
 #include <Eval_Diff_VDF_Elem_Gen.tpp> // templates specializations ici ;)
diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.tpp b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.tpp
index d532e880dd..74a1cf49eb 100644
--- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.tpp
+++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Elem_Gen.tpp
@@ -18,159 +18,6 @@
 
 #include <T_paroi_Champ_P0_VDF.h>
 
-template <typename DERIVED_T> template <typename Type_Double>
-inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_face(const DoubleTab& inco, const DoubleTab& val_b, const int face, const Dirichlet_entree_fluide& la_cl, const int num1, Type_Double& flux) const
-{
-  // Olga avait mis : double dist = 2*Dist_norm_bord(face);
-  // Pierre dit que :
-  const double dist = Dist_norm_bord(face);
-  const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array();
-
-  for (int k = 0; k < ncomp; k++)
-    {
-      if (DERIVED_T::IS_QUASI)
-        {
-          const double T_imp = la_cl.val_imp(face - num1, k);
-          const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
-          flux[k] = (i != -1) ? (T_imp - inco(i, k)) / dv_mvol(i) * surface(face) * porosite(face) * nu_1(i, ori) / dist :
-                    (inco(j, k) - T_imp) / dv_mvol(j) * surface(face) * porosite(face) * nu_1(j, ori) / dist;
-        }
-      else if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
-        {
-          flux[k] = 0.0;
-          for (int l = 0; l < ncomp; l++)
-            {
-              const double T_imp = la_cl.val_imp(face - num1, l);
-              const int ori = ncomp * k + l;
-
-              flux[k] += (i != -1) ? (T_imp - inco(i, l)) * surface(face) * porosite(face) * nu_1(i, ori) / dist :
-                         (inco(j, l) - T_imp) * surface(face) * porosite(face) * nu_1(j, ori) / dist;
-            }
-        }
-      else
-        {
-          const double T_imp = la_cl.val_imp(face - num1, k);
-          const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
-
-          flux[k] = (i != -1) ? (T_imp - inco(i, k)) * surface(face) * porosite(face) * nu_1(i, ori) / dist :
-                    (inco(j, k) - T_imp) * surface(face) * porosite(face) * nu_1(j, ori) / dist;
-        }
-    }
-}
-
-template <typename DERIVED_T> template <typename Type_Double>
-inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_face(const DoubleTab& inco, const DoubleTab& val_b, const int face, const Scalaire_impose_paroi& la_cl, const int num1, Type_Double& flux) const
-{
-  const double dist = Dist_norm_bord(face);
-  const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array();
-
-  for (int k = 0; k < ncomp; k++)
-    {
-      if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
-        {
-          flux[k] = 0.0;
-          for (int l = 0; l < ncomp; l++)
-            {
-              const double T_imp = la_cl.val_imp(face-num1, l);
-              const int ori = ncomp * k + l;
-              flux[k] += (i != -1) ? (T_imp-inco(i,l))*surface(face)*porosite(face)*nu_1(i,ori)/dist :
-                         (inco(j,l)-T_imp)*surface(face)*porosite(face)*nu_1(j,ori)/dist;
-            }
-        }
-      else
-        {
-          const double T_imp = la_cl.val_imp(face-num1,k);
-          const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
-          flux[k] = (i != -1) ? (T_imp-inco(i,k))*surface(face)*porosite(face)*nu_1(i,ori)/dist :
-                    (inco(j,k)-T_imp)*surface(face)*porosite(face)*nu_1(j,ori)/dist;
-        }
-    }
-}
-
-template <typename DERIVED_T> template <typename Type_Double>
-inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_face(const DoubleTab& inco, const DoubleTab& val_b, const int face, const Dirichlet_loi_paroi& la_cl, const int num1, Type_Double& flux) const
-{
-  if (DERIVED_T::IS_MULTI_SCALAR_DIFF) throw;
-
-  const double dist = Dist_norm_bord(face);
-  const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array();
-
-  for (int k = 0; k < ncomp; k++)
-    {
-      const double T_imp = la_cl.val_imp(face-num1,k);
-      const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
-      flux[k] = (i != -1) ? (T_imp-inco(i,k))*surface(face)*porosite(face)*nu_1(i,ori)/dist : (inco(j,k)-T_imp)*surface(face)*porosite(face)*nu_1(j,ori)/dist;
-    }
-}
-
-template <typename DERIVED_T> template <typename Type_Double>
-inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_face(const DoubleTab& , const DoubleTab& val_b, const int face, const Neumann_paroi& la_cl, const int num1, Type_Double& flux) const
-{
-  const int i = elem_(face,0), ncomp = flux.size_array();
-
-  // XXX LUIS : Note : Pas de distinguo entre MULTISCALAR_DIFF et une diffusion normale pour des CL de Neumann
-  for (int k = 0; k < ncomp; k++)
-    flux[k] = ((i != -1) ? 1 : -1) * la_cl.flux_impose(face - num1, k) * surface(face);
-}
-
-template <typename DERIVED_T> template <typename Type_Double>
-inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_face(const DoubleTab& inco, const DoubleTab&, const int face, const Periodique& la_cl, const int , Type_Double& flux) const
-{
-  const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array();
-  const double d0 = le_dom->dist_face_elem0_period(face,i,la_cl.distance()), d1 = le_dom->dist_face_elem1_period(face,j,la_cl.distance());
-
-  for (int k = 0; k < ncomp; k++)
-    {
-      if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
-        {
-
-          if (DERIVED_T::IS_ANISO) throw; // XXX LUIS : pas d'anisotropie pour l'instant
-
-          flux[k] = 0.0;
-          for (int l = 0; l < ncomp; l++)
-            {
-              const int comp_diff = ncomp * k + l;
-              double heq = 0.;
-
-              if (nu_1(i,comp_diff) == 0.0 || nu_1(j,comp_diff) == 0.0) heq = 0.;
-              else
-                {
-                  assert(nu_1(i,comp_diff) != 0.0 && nu_1(j,comp_diff) != 0.0);
-                  heq = compute_heq(d0, i, d1, j, comp_diff);
-                }
-              flux[k] += DERIVED_T::IS_QUASI ? heq*(inco(j,l)/dv_mvol(j) - inco(i,l)/dv_mvol(i))*surface(face)*porosite(face) : heq*(inco(j,l) - inco(i,l))*surface(face)*porosite(face);
-            }
-
-        }
-      else
-        {
-          double heq = -123.;
-          const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
-          if (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) heq = 0.;
-          else
-            {
-              assert(nu_1(i,ori) != 0.0 && nu_1(j,ori) != 0.0);
-              heq = compute_heq(d0,i, d1,j,ori);
-            }
-          flux[k] = DERIVED_T::IS_QUASI ? heq*(inco(j,k)/dv_mvol(j) - inco(i,k)/dv_mvol(i))*surface(face)*porosite(face) : heq*(inco(j,k) - inco(i,k))*surface(face)*porosite(face);
-        }
-    }
-}
-
-template <typename DERIVED_T> template <typename Type_Double>
-inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_face(const DoubleTab& inco, const DoubleTab& val_b, const int face, const Dirichlet_paroi_fixe&, const int num1, Type_Double& flux ) const
-{
-  if (DERIVED_T::IS_MULTI_SCALAR_DIFF) throw;
-
-  const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array();
-  const double dist = Dist_norm_bord(face);
-
-  if (DERIVED_T::IS_QUASI)
-    for (int k = 0; k < ncomp; k++) flux[k] = (i != -1) ? -inco(i,k)*surface(face)*porosite(face)*nu_1(i,k)/dv_mvol(i)/dist : inco(j,k)*surface(face)*porosite(face)*nu_1(j,k)/dv_mvol(j)/dist;
-  else
-    for (int k = 0; k < ncomp; k++) flux[k] = (i != -1) ? -inco(i,k)*surface(face)*porosite(face)*nu_1(i,k)/dist : inco(j,k)*surface(face)*porosite(face)*nu_1(j,k)/dist;
-}
-
 template <typename DERIVED_T> template <typename Type_Double>
 inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_face(const DoubleTab& inco, const DoubleTab&, const int face , const Echange_global_impose& la_cl, const int num1, Type_Double& flux) const
 {
@@ -346,51 +193,157 @@ inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_face(const DoubleTab& inco,
     }
 }
 
-template <typename DERIVED_T> template <typename Type_Double>
-inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_faces_interne(const DoubleTab& inco, const int face, Type_Double& flux) const
+template <typename DERIVED_T> template <typename BC> // Boundary-face flux of component k, written to flux; BC picks the branch at compile time; val_b is not read by any branch
+KOKKOS_INLINE_FUNCTION void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_faces_bord_comp(CDoubleTabView inco, CDoubleTabView val_b, const int face, const BC_View& bc_view, const int num1, const int k, double& flux) const
 {
-  const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array();
-  double heq, d0 = Dist_face_elem0(face,i), d1 = Dist_face_elem1(face,j);
-  for (int k = 0; k < ncomp; k++)
+  if constexpr (std::is_same_v<BC, Periodique>)
     {
-      const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
-      if (DERIVED_T::IS_RANS)
-        {
-          heq = compute_heq(d0,i, d1,j,ori); // pas d'assert pour k-eps !
-          flux[k] = DERIVED_T::IS_QUASI ? heq*(inco(j,k)/dv_mvol(j) - inco(i,k)/dv_mvol(i))*surface(face)*porosite(face) :
-                    heq*(inco(j,k)-inco(i,k))*surface(face)*porosite(face);
-        }
-      else if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
+      if (DERIVED_T::IS_MULTI_SCALAR_DIFF) { Process::Kokkos_exit("IS_MULTI_SCALAR_DIFF not supported in _comp"); return; }
+      const double distance = bc_view.val[0](0,0); // the periodic distance is read from entry (0,0) of the first BC view
+      const int i = elem_v_(face,0), j = elem_v_(face,1);
+      const double d0 = le_dom_v_.dist_face_elem0_period(face,i,distance);
+      const double d1 = le_dom_v_.dist_face_elem1_period(face,j,distance);
+      const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k;
+      const double heq = (nu_1<DeviceSpace>(i,ori) == 0.0 || nu_1<DeviceSpace>(j,ori) == 0.0) ? 0. : compute_heq<DeviceSpace>(d0,i,d1,j,ori); // zero diffusivity on either side: no exchange, compute_heq is not evaluated
+      flux = DERIVED_T::IS_QUASI ? heq*(inco(j,k)/dv_mvol<DeviceSpace>(j) - inco(i,k)/dv_mvol<DeviceSpace>(i))*surface_v_(face)*porosite_v_(face)
+             : heq*(inco(j,k) - inco(i,k))*surface_v_(face)*porosite_v_(face);
+    }
+  else if constexpr (std::is_same_v<BC, Dirichlet_entree_fluide>)
+    {
+      const double dist = get_dist_norm_bord<DeviceSpace>(face);
+      const int i = elem_v_(face,0), j = elem_v_(face,1); // i == -1 means the face has no first element: the j side is used instead
+      if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
         {
-          flux[k] = 0.0;
+          const int ncomp = (int)inco.extent(1);
+          flux = 0.;
           for (int l = 0; l < ncomp; l++)
             {
-              const int comp_diff = ncomp * k + l;
-              if (nu_1(i,comp_diff) == 0.0 || nu_1(j,comp_diff) == 0.0) heq = 0.;
-              else
-                {
-                  assert(nu_1(i,comp_diff) != 0.0 && nu_1(j,comp_diff) != 0.0);
-                  heq = compute_heq(d0, i, d1, j, comp_diff);
-                }
-              flux[k] += heq * (inco(j, l) - inco(i, l)) * surface(face) * porosite(face);
+              const double T_imp = bc_view.val[0](face - num1, l);
+              const int ori = ncomp * k + l;
+              flux += (i != -1) ? (T_imp - inco(i,l)) * surface_v_(face) * porosite_v_(face) * nu_1<DeviceSpace>(i,ori) / dist
+                      : (inco(j,l) - T_imp) * surface_v_(face) * porosite_v_(face) * nu_1<DeviceSpace>(j,ori) / dist;
             }
         }
+      else if (DERIVED_T::IS_QUASI)
+        {
+          const double T_imp = bc_view.val[0](face-num1,k);
+          const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k;
+          flux = (i != -1) ? (T_imp-inco(i,k))/dv_mvol<DeviceSpace>(i)*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(i,ori)/dist
+                 : (inco(j,k)-T_imp)/dv_mvol<DeviceSpace>(j)*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(j,ori)/dist;
+        }
       else
         {
-          if (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) heq = 0.;
-          else
+          const double T_imp = bc_view.val[0](face-num1,k);
+          const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k;
+          flux = (i != -1) ? (T_imp-inco(i,k))*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(i,ori)/dist
+                 : (inco(j,k)-T_imp)*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(j,ori)/dist;
+        }
+    }
+  else if constexpr (std::is_same_v<BC, Scalaire_impose_paroi>)
+    {
+      const double dist = get_dist_norm_bord<DeviceSpace>(face);
+      const int i = elem_v_(face,0), j = elem_v_(face,1);
+      if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
+        {
+          const int ncomp = (int)inco.extent(1);
+          flux = 0.;
+          for (int l = 0; l < ncomp; l++)
             {
-              assert(nu_1(i,ori) != 0.0 && nu_1(j,ori) != 0.0);
-              heq = compute_heq(d0,i, d1,j,ori);
+              const double T_imp = bc_view.val[0](face - num1, l);
+              const int ori = ncomp * k + l;
+              flux += (i != -1) ? (T_imp - inco(i,l)) * surface_v_(face) * porosite_v_(face) * nu_1<DeviceSpace>(i,ori) / dist
+                      : (inco(j,l) - T_imp) * surface_v_(face) * porosite_v_(face) * nu_1<DeviceSpace>(j,ori) / dist;
             }
-          flux[k] = heq*(inco(j,k)-inco(i,k))*surface(face)*porosite(face);
+        }
+      else
+        {
+          const double T_imp = bc_view.val[0](face-num1,k);
+          const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k;
+          flux = (i != -1) ? (T_imp-inco(i,k))*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(i,ori)/dist
+                 : (inco(j,k)-T_imp)*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(j,ori)/dist;
         }
     }
+  else if constexpr (std::is_same_v<BC, Dirichlet_loi_paroi>)
+    {
+      if (DERIVED_T::IS_MULTI_SCALAR_DIFF) { Process::Kokkos_exit("IS_MULTI_SCALAR_DIFF not supported in _comp"); return; }
+      const double dist = get_dist_norm_bord<DeviceSpace>(face);
+      const int i = elem_v_(face,0), j = elem_v_(face,1);
+      const double T_imp = bc_view.val[0](face-num1,k);
+      const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k;
+      flux = (i != -1) ? (T_imp-inco(i,k))*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(i,ori)/dist
+             : (inco(j,k)-T_imp)*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(j,ori)/dist;
+    }
+  else if constexpr (std::is_same_v<BC, Neumann_paroi>)
+    {
+      // No distinction between IS_MULTI_SCALAR_DIFF and normal diffusion for Neumann BCs
+      const int i = elem_v_(face,0);
+      flux = ((i != -1) ? 1 : -1) * bc_view.val[0](face-num1,k)*surface_v_(face);
+    }
+  else if constexpr (std::is_same_v<BC, Dirichlet_paroi_fixe>)
+    {
+      if (DERIVED_T::IS_MULTI_SCALAR_DIFF) { Process::Kokkos_exit("IS_MULTI_SCALAR_DIFF not supported in _comp"); return; }
+      const int i = elem_v_(face,0), j = elem_v_(face,1);
+      const double dist = get_dist_norm_bord<DeviceSpace>(face);
+      if (DERIVED_T::IS_QUASI)
+        flux = (i != -1) ? -inco(i,k)*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(i,k)/dv_mvol<DeviceSpace>(i)/dist
+               :  inco(j,k)*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(j,k)/dv_mvol<DeviceSpace>(j)/dist;
+      else
+        flux = (i != -1) ? -inco(i,k)*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(i,k)/dist
+               :  inco(j,k)*surface_v_(face)*porosite_v_(face)*nu_1<DeviceSpace>(j,k)/dist;
+    }
+  else if constexpr(std::is_same_v<BC, Echange_global_impose>)
+    {
+      if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
+        {
+          Process::Kokkos_exit("IS_MULTI_SCALAR_DIFF not supported for Echange_global_impose");
+          return;
+        }
+      const int i = elem_v_(face, 0), j = elem_v_(face, 1);
+      const double h = bc_view.val[0](face - num1, k);
+      const double Text = bc_view.val[1](face - num1, k);
+      const double phi = bc_view.val[2].data() ? bc_view.val[2](face - num1, k) : 0.0; // third view may have no data: phi is then 0
+      flux = (i != -1) ? (phi + h * (Text - inco(i, k))) * surface_v_(face)
+             : (-phi + h * (inco(j, k) - Text)) * surface_v_(face);
+    }
+  else
+    flux = 0.; // Do nothing for Neumann_sortie_libre, Symetrie, etc.
+}
+
+template <typename DERIVED_T> // Flux through internal face `face` for component k, written to flux
+KOKKOS_INLINE_FUNCTION void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::flux_faces_interne_comp(CDoubleTabView inco, const int face, const int k, double& flux) const
+{
+  const int e0 = elem_v_(face,0);
+  const int e1 = elem_v_(face,1);
+  const double dist0 = get_dist_face_elem0<DeviceSpace>(face,e0), dist1 = get_dist_face_elem1<DeviceSpace>(face,e1);
+  const int ori = DERIVED_T::IS_ANISO ? orientation_v_(face) : k;
+  if (DERIVED_T::IS_RANS)
+    {
+      const double heq = compute_heq<DeviceSpace>(dist0,e0,dist1,e1,ori); // this branch has no zero-diffusivity guard
+      if (DERIVED_T::IS_QUASI) flux = heq*(inco(e1,k)/dv_mvol<DeviceSpace>(e1) - inco(e0,k)/dv_mvol<DeviceSpace>(e0))*surface_v_(face)*porosite_v_(face);
+      else flux = heq*(inco(e1,k)-inco(e0,k))*surface_v_(face)*porosite_v_(face);
+    }
+  else if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
+    {
+      const int nb_comp = static_cast<int>(inco.extent(1));
+      flux = 0.;
+      for (int c = 0; c < nb_comp; ++c)
+        {
+          const int idx = nb_comp * k + c; // diffusivity entry coupling component k with component c
+          const bool no_diff = nu_1<DeviceSpace>(e0,idx) == 0.0 || nu_1<DeviceSpace>(e1,idx) == 0.0;
+          flux += (no_diff ? 0. : compute_heq<DeviceSpace>(dist0, e0, dist1, e1, idx)) * (inco(e1, c) - inco(e0, c)) * surface_v_(face) * porosite_v_(face);
+        }
+    }
+  else
+    {
+      const bool no_diff = nu_1<DeviceSpace>(e0,ori) == 0.0 || nu_1<DeviceSpace>(e1,ori) == 0.0; // zero diffusivity on either side: no exchange
+      const double heq = no_diff ? 0. : compute_heq<DeviceSpace>(dist0,e0,dist1,e1,ori);
+      flux = heq*(inco(e1,k)-inco(e0,k))*surface_v_(face)*porosite_v_(face);
+    }
 }
 
 /* ************************************** *
- * *********  POUR L'IMPLICITE ********** *
- * ************************************** */
+* *********  POUR L'IMPLICITE ********** *
+* ************************************** */
 
 template <typename DERIVED_T> template <typename Type_Double>
 inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::coeffs_face(const int face, const int, const Dirichlet_entree_fluide& la_cl, Type_Double& aii, Type_Double& ajj) const
@@ -399,7 +352,7 @@ inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::coeffs_face(const int face, const
   const int i = elem_(face,0), j = elem_(face,1),
             ncomp = DERIVED_T::IS_MULTI_SCALAR_DIFF ? static_cast<int>(std::sqrt(aii.size_array())) : aii.size_array();
 
-  const double dist = Dist_norm_bord(face);
+  const double dist = get_dist_norm_bord(face);
 
   for (int k = 0; k < ncomp; k++)
     if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
@@ -426,7 +379,7 @@ inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::coeffs_face(const int face, const
   const int i = elem_(face,0), j = elem_(face,1),
             ncomp = DERIVED_T::IS_MULTI_SCALAR_DIFF ? static_cast<int>(std::sqrt(aii.size_array())) : aii.size_array();
 
-  const double dist = Dist_norm_bord(face);
+  const double dist = get_dist_norm_bord(face);
 
   for (int k = 0; k < ncomp; k++)
     if (DERIVED_T::IS_MULTI_SCALAR_DIFF)
@@ -451,7 +404,7 @@ inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::coeffs_face(const int face, const
 {
   assert (aii.size_array() == ajj.size_array());
   const int i = elem_(face,0), j = elem_(face,1), ncomp = aii.size_array();
-  const double dist = Dist_norm_bord(face);
+  const double dist = get_dist_norm_bord(face);
   for (int k = 0; k < ncomp; k++)
     {
       const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
@@ -473,13 +426,7 @@ inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::coeffs_face(const int face, const
           for (int l = 0; l < ncomp; l++)
             {
               const int ori = ncomp * k + l;
-              double heq;
-              if (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) heq = 0.;
-              else
-                {
-                  assert(nu_1(i,ori) != 0.0 && nu_1(j,ori) != 0.0);
-                  heq = compute_heq(d0, i, d1, j, ori);
-                }
+              double heq = (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) ? 0. : compute_heq(d0, i, d1, j, ori);
               aii[ori] = heq * surface(face) * porosite(face);
               ajj[ori] = heq * surface(face) * porosite(face);
             }
@@ -487,13 +434,7 @@ inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::coeffs_face(const int face, const
       else
         {
           const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
-          double heq;
-          if (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) heq = 0.;
-          else
-            {
-              assert(nu_1(i,ori) != 0.0 && nu_1(j,ori) != 0.0);
-              heq = compute_heq(d0, i, d1, j, ori);
-            }
+          double heq = (nu_1(i,ori) == 0.0 || nu_1(j,ori) == 0.0) ? 0. : compute_heq(d0, i, d1, j, ori);
           aii[k] = ajj[k] = heq*surface(face)*porosite(face); // On peut faire ca !
         }
     }
@@ -504,7 +445,7 @@ inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::coeffs_face(const int face, const
 {
   assert (aii.size_array() == ajj.size_array());
   const int i = elem_(face,0), j = elem_(face,1), ncomp = aii.size_array();
-  const double dist = Dist_norm_bord(face);
+  const double dist = get_dist_norm_bord(face);
   if (DERIVED_T::IS_QUASI)
     {
       for (int k = 0; k < ncomp; k++)
@@ -601,7 +542,7 @@ inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::coeffs_faces_interne(const int fa
 {
   assert (aii.size_array() == ajj.size_array());
   const int i = elem_(face,0), j = elem_(face,1), ncomp = DERIVED_T::IS_MULTI_SCALAR_DIFF ? int(sqrt(aii.size_array())) : aii.size_array();
-  double heq, d0 = Dist_face_elem0(face,i), d1 = Dist_face_elem1(face,j);
+  double heq, d0 = get_dist_face_elem0(face,i), d1 = get_dist_face_elem1(face,j);
   for (int k = 0; k < ncomp; k++)
     {
       const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
@@ -644,7 +585,7 @@ template <typename DERIVED_T> template <typename Type_Double>
 inline void Eval_Diff_VDF_Elem_Gen<DERIVED_T>::secmem_face(const int face, const Dirichlet_entree_fluide& la_cl, const int num1, Type_Double& flux) const
 {
   const int i = elem_(face,0), j = elem_(face,1), ncomp = flux.size_array();
-  double dist = Dist_norm_bord(face);
+  double dist = get_dist_norm_bord(face);
   for (int k = 0; k < ncomp; k++)
     {
       const int ori = DERIVED_T::IS_ANISO ? orientation(face) : k;
diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.h
index b783b48e70..d989ebc60d 100644
--- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.h
+++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -42,37 +42,47 @@ class Eval_Diff_VDF_Face_Gen : public Eval_VDF_Face, public Evaluateur_VDF
    * *********  POUR L'EXPLICITE ********** *
    * ************************************** */
 
-  template<Type_Flux_Fa7 Fa7_Type, typename Type_Double> inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void>
-  flux_fa7(const DoubleTab&, const DoubleTab*, int , const Neumann_sortie_libre&, int, Type_Double& ) const { /* Do nothing */ }
+  template<Type_Flux_Arete Arete_Type, typename Type_Double>
+  inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void>
+  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ;
 
-  template<Type_Flux_Fa7 Fa7_Type, typename Type_Double> inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void>
-  flux_fa7(const DoubleTab&, const DoubleTab*, int, int, int, Type_Double& ) const;
+  // _comp variants: compute a single component k and return it through the trailing double& output argument(s)
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::INTERNE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ;
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::MIXTE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ;
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t<Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const ;
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::NAVIER, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t<(Arete_Type == Type_Flux_Arete::NAVIER), void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& ) const;
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, int, double&, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double>
-  inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE  || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double&) const;
+  template<Type_Flux_Arete Arete_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::PERIODICITE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, int, double&, double&) const;
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PERIODICITE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double&) const ;
+  template<Type_Flux_Arete Arete_Type> // COIN_FLUIDE edges: not implemented for this evaluator
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
+  flux_arete_comp(CDoubleTabView, CDoubleTabView, CDoubleTabView, CDoubleTabView, int, int, int, int, int, double& f3, double& f12) const
+  { Process::Kokkos_exit("arete_coin_fluide not coded for this scheme."); f3 = f12 = 0.; } // reports the error through Process::Kokkos_exit, then zeroes both outputs (presumably so they are defined if the call returns - confirm)
 
-  template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
-  flux_arete(const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double&, Type_Double&) const
-  {
-    Cerr << "arete_coin_fluide not coded for this scheme. Ask the TRUST support to code like Eval_Amont_VDF_Face !" << finl;
-    Process::exit();
-  }
+  template<Type_Flux_Fa7 Fa7_Type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Fa7_Type == Type_Flux_Fa7::ELEM, void>
+  flux_fa7_comp(CDoubleTabView, CDoubleTabView, int, int, int, int, double&) const;
+
+  template<Type_Flux_Fa7 Fa7_Type> // SORTIE_LIBRE: no diffusive flux
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<Fa7_Type == Type_Flux_Fa7::SORTIE_LIBRE, void>
+  flux_fa7_comp(CDoubleTabView, CDoubleTabView, int, CDoubleTabView, int, int, double& f) const { f = 0.; } // every input is ignored; the output is always set to 0
 
   /* ************************************** *
    * *********  POUR L'IMPLICITE ********** *
@@ -94,35 +104,51 @@ class Eval_Diff_VDF_Face_Gen : public Eval_VDF_Face, public Evaluateur_VDF
   coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& ) const;
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-  coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const;
+  coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const;
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double>
   inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-  coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const;
+  coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const;
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< (Arete_Type == Type_Flux_Arete::NAVIER), void>
-  coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const;
+  coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const;
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< (Arete_Type == Type_Flux_Arete::COIN_FLUIDE), void>
-  coeffs_arete(const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const { /* Do nothing */ }
+  coeffs_arete(const DoubleTab&, const DoubleTab&, const DoubleTab*, int, int, int, int, Type_Double& , Type_Double& , Type_Double& ) const { /* Do nothing */ }
+
+
+  void view_ro() const override // two-step call: base class first, then the derived evaluator
+  {
+    Evaluateur_VDF::view_ro(); // base-class part (presumably prepares the *_v_ views read by the _comp kernels - confirm)
+    static_cast<const DERIVED_T *>(this)->view_ro_impl(); // CRTP dispatch to the derived evaluator's own view_ro_impl()
+  }
 
 private:
   inline double surface_(int i,int j) const { return 0.5*(surface(i)+surface(j)); }
   inline double porosity_(int i,int j) const { return 0.5*(porosite(i)+porosite(j)); }
+  KOKKOS_INLINE_FUNCTION double mean_surface(int i,int j) const { return 0.5*(surface_v_(i)+surface_v_(j)); } // arithmetic mean of surface_v_ at faces i and j (same formula as surface_, read from surface_v_)
+  KOKKOS_INLINE_FUNCTION double mean_porosity(int i,int j) const { return 0.5*(porosite_v_(i)+porosite_v_(j)); } // arithmetic mean of porosite_v_ at faces i and j (same formula as porosity_, read from porosite_v_)
 
   // CRTP pattern to static_cast the appropriate class and get the implementation : This is magic !
-  inline double nu_mean_2pts(int i=0, int j=0, int compo=0) const { return static_cast<const DERIVED_T *>(this)->nu_1_impl_face(i, j, compo); }
-  inline double nu_mean_4pts(int i, int j, int k, int l, int compo=0) const { return static_cast<const DERIVED_T *>(this)->nu_2_impl_face(i, j, k, l,compo); }
-  inline double nu_lam(int i, int j=0) const { return static_cast<const DERIVED_T *>(this)->nu_2_impl(i,j); } // Attention nu_2_impl and not nu_1_impl for Dift ...
-  inline double nu_lam_mean_4pts(int i, int j, int k, int l, int compo=0) const { return static_cast<const DERIVED_T *>(this)->nu_lam_impl_face(i,j,k,l,compo); }
-  inline double nu_lam_mean_2pts(int i, int j, int compo=0) const { return static_cast<const DERIVED_T *>(this)->nu_lam_impl_face2(i,j,compo); }
-  inline double nu_turb(int i, int compo=0) const { return static_cast<const DERIVED_T *>(this)->nu_t_impl(i,compo); }
-  inline double tau_tan(int i, int j) const { return static_cast<const DERIVED_T *>(this)->tau_tan_impl(i,j); }
-  inline bool uses_wall_law() const { return static_cast<const DERIVED_T *>(this)->uses_wall(); }
-  inline bool uses_mod_turb() const { return static_cast<const DERIVED_T *>(this)->uses_mod(); }
+  template<typename ExecSpace=HostSpace> // ExecSpace is forwarded to the derived implementation; defaults to HostSpace
+  KOKKOS_INLINE_FUNCTION double nu_mean_2pts(int i=0, int j=0, int compo=0) const { return static_cast<const DERIVED_T *>(this)->template nu_1_impl_face<ExecSpace>(i, j, compo); } // CRTP forward to DERIVED_T::nu_1_impl_face
+  template<typename ExecSpace=HostSpace> // ExecSpace is forwarded to the derived implementation; defaults to HostSpace
+  KOKKOS_INLINE_FUNCTION double nu_mean_4pts(int i, int j, int k, int l, int compo=0) const { return static_cast<const DERIVED_T *>(this)->template nu_2_impl_face<ExecSpace>(i, j, k, l,compo); } // CRTP forward to DERIVED_T::nu_2_impl_face
+  template<typename ExecSpace=HostSpace> // ExecSpace is forwarded to the derived implementation; defaults to HostSpace
+  KOKKOS_INLINE_FUNCTION double nu_lam(int i, int j=0) const { return static_cast<const DERIVED_T *>(this)->template nu_2_impl<ExecSpace>(i,j); } // Caution: forwards to nu_2_impl and not nu_1_impl, for Dift ...
+  template<typename ExecSpace=HostSpace> // ExecSpace is forwarded to the derived implementation; defaults to HostSpace
+  KOKKOS_INLINE_FUNCTION double nu_lam_mean_4pts(int i, int j, int k, int l, int compo=0) const { return static_cast<const DERIVED_T *>(this)->template nu_lam_impl_face<ExecSpace>(i,j,k,l,compo); } // CRTP forward to DERIVED_T::nu_lam_impl_face
+  template<typename ExecSpace=HostSpace> // ExecSpace is forwarded to the derived implementation; defaults to HostSpace
+  KOKKOS_INLINE_FUNCTION double nu_lam_mean_2pts(int i, int j, int compo=0) const { return static_cast<const DERIVED_T *>(this)->template nu_lam_impl_face2<ExecSpace>(i,j,compo); } // CRTP forward to DERIVED_T::nu_lam_impl_face2
+  template<typename ExecSpace=HostSpace> // ExecSpace is forwarded to the derived implementation; defaults to HostSpace
+  KOKKOS_INLINE_FUNCTION double nu_turb(int i, int compo=0) const { return static_cast<const DERIVED_T *>(this)->template nu_t_impl<ExecSpace>(i,compo); } // CRTP forward to DERIVED_T::nu_t_impl
+  KOKKOS_INLINE_FUNCTION double tau_tan(int i, int j) const { return static_cast<const DERIVED_T *>(this)->tau_tan_impl(i,j); } // CRTP forward to DERIVED_T::tau_tan_impl (no ExecSpace template argument, unlike the nu_* forwarders)
+  KOKKOS_INLINE_FUNCTION bool uses_wall_law() const { return static_cast<const DERIVED_T *>(this)->uses_wall(); } // CRTP forward to DERIVED_T::uses_wall
+  KOKKOS_INLINE_FUNCTION bool uses_mod_turb() const { return static_cast<const DERIVED_T *>(this)->uses_mod(); } // CRTP forward to DERIVED_T::uses_mod
   inline const DoubleTab& k_elem() const { return static_cast<const DERIVED_T *>(this)->get_k_elem(); } // pour F5 seulement ...
 
   // methods to check coeffs/flux implementation
+  /*
   static constexpr double EPS = 1e-6;
 
   template<typename Type_Double> void check_error(const char * , const int, const int , const Type_Double& , const Type_Double& , const Type_Double& ) const;
@@ -133,11 +159,11 @@ class Eval_Diff_VDF_Face_Gen : public Eval_VDF_Face, public Evaluateur_VDF
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double>
   inline std::enable_if_t<Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-  test_coeffs_common(const int , const int , const int , const int , Type_Double& , Type_Double& ) const;
+  test_coeffs_common(const DoubleTab&, const DoubleTab&, const int , const int , const int , const int , Type_Double& , Type_Double& ) const;
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double>
   inline std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-  test_coeffs_common(const int , const int , const int , const int , Type_Double& , Type_Double& , Type_Double& , Type_Double&) const;
+  test_coeffs_common(const DoubleTab&, const DoubleTab&, const int , const int , const int , const int , Type_Double& , Type_Double& , Type_Double& , Type_Double&) const;
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double>
   inline std::enable_if_t<Arete_Type == Type_Flux_Arete::PERIODICITE, void>
@@ -160,11 +186,12 @@ class Eval_Diff_VDF_Face_Gen : public Eval_VDF_Face, public Evaluateur_VDF
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double>
   inline std::enable_if_t<Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-  test_coeffs_arete(const int, const int, const int, const int, const Type_Double&) const;
+  test_coeffs_arete(const DoubleTab&, const DoubleTab&, const int, const int, const int, const int, const Type_Double&) const;
 
   template<Type_Flux_Arete Arete_Type, typename Type_Double>
   inline std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-  test_coeffs_arete(const int, const int, const int, const int, const Type_Double& , const Type_Double&) const;
+  test_coeffs_arete(const DoubleTab&, const DoubleTab&, const int, const int, const int, const int, const Type_Double& , const Type_Double&) const;
+   */
 };
 
 #include <Eval_Diff_VDF_Face_Gen.tpp> // templates specializations ici ;)
diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.tpp b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.tpp
index 984555ffa4..26a9886586 100644
--- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.tpp
+++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Diff_VDF_Face_Gen.tpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -25,20 +25,8 @@
  * *********  POUR L'EXPLICITE ********** *
  * ************************************** */
 
-template <typename DERIVED_T> template<Type_Flux_Fa7 Fa7_Type, typename Type_Double> inline std::enable_if_t< Fa7_Type == Type_Flux_Fa7::ELEM, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_fa7(const DoubleTab& inco, const DoubleTab*, int elem, int fac1, int fac2, Type_Double& flux) const
-{
-  const int ori = orientation(fac1), ncomp = flux.size_array();
-  const double dist = dist_face(fac1,fac2,ori), surf = 0.5*(surface(fac1)*porosite(fac1)+surface(fac2)*porosite(fac2));
-  for (int k = 0; k < ncomp; k++)
-    {
-      const double tau = (inco(fac2,k)-inco(fac1,k))/dist, tau_tr = ACTIVATE_TAU_TR ? tau : 0.0;
-      const double visc_lam = nu_lam(elem, k), visc_turb = DERIVED_T::IS_TURB ? nu_turb(elem, k) : 0.;
-      flux[k] = ((tau + tau_tr) * (visc_lam + visc_turb)) * surf;
-    }
-}
-
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void>
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
+inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void>
 Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int fac4, Type_Double& flux) const
 {
   const int ori1 = orientation(fac1), ori3 = orientation(fac3), elem1 = elem_(fac3,0), elem2 = elem_(fac3,1), elem3 = elem_(fac4,0), elem4 = elem_(fac4,1), ncomp = flux.size_array();
@@ -53,156 +41,129 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete(const DoubleTab& inco, const Doubl
     }
 }
 
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int fac4, Type_Double& flux) const
+// ===== _comp scalar variants (one component k) for MDRangePolicy kernels =====
+
+template <typename DERIVED_T> template<Type_Flux_Fa7 Fa7_Type> // ELEM fa7: flux of component k between fac1 and fac2 of element 'elem', written to 'flux'
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Fa7_Type == Type_Flux_Fa7::ELEM, void>
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_fa7_comp(CDoubleTabView inco, CDoubleTabView, int elem, int fac1, int fac2, int k, double& flux) const
 {
-  const int N = flux.size_array();
-  int elem[4], ori1 = orientation(fac1), ori3 = orientation(fac3);
-  elem[0] = elem_(fac3,0), elem[1] = elem_(fac3,1), elem[2] = elem_(fac4,0), elem[3] = elem_(fac4,1);
-  std::vector<double> visc_lam_temp(N), visc_turb_temp(N);
-  for (int k = 0; k < N; k++)
-    for (int i = 0; i < 4; i++)
-      if (elem[i] != -1)
-        {
-          visc_lam_temp[k] += nu_lam(elem[i], k);
-          visc_turb_temp[k] += nu_turb(elem[i], k);
-        }
-  for (int k = 0; k < N; k++)
-    {
-      visc_lam_temp[k] /= 3.0;
-      visc_turb_temp[k] /= 3.0;
-    }
-  const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2);
+  const int ori = orientation_v_(fac1); // orientation of fac1, used as the direction for dist_face
+  const double dist = le_dom_v_.dist_face(fac1, fac2, ori);
+  const double surf = 0.5 * (surface_v_(fac1) * porosite_v_(fac1) + surface_v_(fac2) * porosite_v_(fac2)); // mean of surface*porosity over the two faces (porosity is folded in here, no separate factor below)
+  const double tau = (inco(fac2, k) - inco(fac1, k)) / dist; // difference of inco between fac2 and fac1 divided by dist
+  const double tau_tr = ACTIVATE_TAU_TR ? tau : 0.0; // second term equals tau here; kept only when ACTIVATE_TAU_TR is set
+  const double visc_lam = nu_lam<DeviceSpace>(elem, k);
+  const double visc_turb = DERIVED_T::IS_TURB ? nu_turb<DeviceSpace>(elem, k) : 0.; // nu_turb is only added when DERIVED_T::IS_TURB
+  flux = ((tau + tau_tr) * (visc_lam + visc_turb)) * surf;
+}
 
-  for (int k = 0; k < N; k++)
-    if (inco(fac4,k)*inco(fac3,k) != 0)
-      {
-        const double visc_lam = visc_lam_temp[k], visc_turb = DERIVED_T::IS_TURB ? visc_turb_temp[k] : 0.0;
-        const double tau = (inco(fac4,k)-inco(fac3,k))/dist_face(fac3,fac4,ori1), tau_tr = ACTIVATE_TAU_TR ? (inco(fac2,k)-inco(fac1,k))/dist_face(fac1,fac2,ori3) : 0.0;
-        flux[k] = ((tau + tau_tr) * (visc_lam + visc_turb)) * surf * poros;
-      }
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // INTERNE edge: flux of component k, written to 'flux'
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::INTERNE, void>
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView, int fac1, int fac2, int fac3, int fac4, int k, double& flux) const
+{
+  const int ori1 = orientation_v_(fac1), ori3 = orientation_v_(fac3);
+  const int elem1 = elem_v_(fac3,0), elem2 = elem_v_(fac3,1), elem3 = elem_v_(fac4,0), elem4 = elem_v_(fac4,1); // the four element indices stored for fac3 and fac4
+  const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2); // arithmetic means over fac1 and fac2
+  const double tau = (inco(fac4,k)-inco(fac3,k))/le_dom_v_.dist_face(fac3,fac4,ori1); // inco difference between fac4 and fac3 over their distance along ori1
+  const double tau_tr = ACTIVATE_TAU_TR ? (inco(fac2,k)-inco(fac1,k))/le_dom_v_.dist_face(fac1,fac2,ori3) : 0.0; // cross term on fac1/fac2 along ori3, kept only when ACTIVATE_TAU_TR is set
+  const int ind = DERIVED_T::IS_ANISO ? ori3 : k; // viscosity component index: ori3 when IS_ANISO, otherwise k
+  const double visc_lam = nu_lam_mean_4pts<DeviceSpace>(elem1,elem2,elem3,elem4,ind);
+  const double visc_turb = DERIVED_T::IS_TURB ? nu_mean_4pts<DeviceSpace>(elem1,elem2,elem3,elem4,ind) : 0.0; // 4-point mean viscosity is only added when DERIVED_T::IS_TURB
+  flux = ((tau+tau_tr)*(visc_lam+visc_turb))*surf*poros;
 }
 
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
-inline std::enable_if_t<Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& flux) const
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // MIXTE edge: flux of component k, written to 'flux'
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::MIXTE, void>
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView, int fac1, int fac2, int fac3, int fac4, int k, double& flux) const
+{
+  const int ori1 = orientation_v_(fac1), ori3 = orientation_v_(fac3);
+  const int elems[4] = {elem_v_(fac3,0), elem_v_(fac3,1), elem_v_(fac4,0), elem_v_(fac4,1)}; // element indices stored for fac3 and fac4; entries may be -1
+  const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2);
+  if (inco(fac4,k)*inco(fac3,k) == 0) { flux = 0.; return; } // exact-zero test: flux is 0 as soon as inco vanishes on fac3 or fac4
+  double visc_lam_temp = 0, visc_turb_temp = 0;
+  for (int i = 0; i < 4; i++)
+    if (elems[i] != -1) { visc_lam_temp += nu_lam<DeviceSpace>(elems[i],k); visc_turb_temp += nu_turb<DeviceSpace>(elems[i],k); } // entries equal to -1 are skipped; nu_turb is summed even when IS_TURB is false
+  visc_lam_temp /= 3.0; // NOTE(review): fixed divisor 3 whatever the number of summed entries - presumably exactly one of the four is -1 on a MIXTE edge; confirm
+  visc_turb_temp /= 3.0;
+  const double visc_lam = visc_lam_temp, visc_turb = DERIVED_T::IS_TURB ? visc_turb_temp : 0.0; // the accumulated turbulent part is dropped unless DERIVED_T::IS_TURB
+  const double tau = (inco(fac4,k)-inco(fac3,k))/le_dom_v_.dist_face(fac3,fac4,ori1);
+  const double tau_tr = ACTIVATE_TAU_TR ? (inco(fac2,k)-inco(fac1,k))/le_dom_v_.dist_face(fac1,fac2,ori3) : 0.0; // cross term, kept only when ACTIVATE_TAU_TR is set
+  flux = ((tau+tau_tr)*(visc_lam+visc_turb))*surf*poros;
+}
+
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // PAROI / NAVIER_PAROI edge: flux of component k, written to 'flux'
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView val_imp_face_bord, CDoubleTabView coeff_frottement_face_bord, CDoubleTabView, int fac1, int fac2, int fac3, int signe, int k, double& flux) const
 {
   constexpr bool is_PAROI = (Arete_Type == Type_Flux_Arete::PAROI);
-  const int rang1 = (fac1-premiere_face_bord), rang2 = (fac2-premiere_face_bord), ori = orientation(fac3), ncomp = flux.size_array();
-  if ( !uses_wall_law() )
+  const int ori = orientation_v_(fac3);
+  if (!uses_wall_law()) // branch without wall law: flux built from the imposed value and inco on fac3
     {
-      int elem1 = elem_(fac3,0), elem2 = elem_(fac3,1);
-      if (is_PAROI)
-        {
-          if (elem1 == -1) elem1 = elem2;
-          else if (elem2 == -1) elem2 = elem1;
-        }
-
-      const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2), dist = dist_norm_bord(fac1), tps = inconnue->temps();
-
-      const double vit_imp = is_PAROI ? 0.5*(Champ_Face_get_val_imp_face_bord(tps,rang1,ori,la_zcl.valeur())+Champ_Face_get_val_imp_face_bord(tps,rang2,ori,la_zcl.valeur())) :
-                             0.5*(Champ_Face_get_val_imp_face_bord_sym(inco,tps,rang1,ori,la_zcl.valeur())+Champ_Face_get_val_imp_face_bord_sym(inco,tps,rang2,ori,la_zcl.valeur()));
-
+      int elem1 = elem_v_(fac3,0), elem2 = elem_v_(fac3,1);
+      if (is_PAROI) { if (elem1==-1) elem1=elem2; else if (elem2==-1) elem2=elem1; } // PAROI only: an index of -1 is replaced by the other element of fac3
+      const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2), dist = le_dom_v_.dist_norm_bord(fac1);
+      const double vit_imp = 0.5*(val_imp_face_bord(fac1,ori)+val_imp_face_bord(fac2,ori)); // mean over fac1/fac2; NOTE(review): indexed by fac, and the same view serves PAROI and NAVIER_PAROI (the vector version used rang = fac - premiere_face_bord and a _sym getter for NAVIER_PAROI) - confirm how the caller fills the view
       double coeff = 0.0;
-      for (int k = 0; k < ncomp; k++)
-        {
-          if (!is_PAROI) // NAVIER_PAROI
-            coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur()));
-
-          const int ind = DERIVED_T::IS_ANISO ? ori : k;
-          const double visc_lam = nu_lam_mean_2pts(elem1,elem2,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts(elem1,elem2,ind) : 0.0;
-          const double tau  = (signe*(vit_imp-inco(fac3,k))/dist) - (signe * coeff * inco(fac3, k)), tau_tr = 0.;
-          flux[k] = ((tau + tau_tr) * (visc_lam + visc_turb)) * surf * poros;
-        }
+      if (!is_PAROI) coeff = 0.5*(coeff_frottement_face_bord(fac1,k)+coeff_frottement_face_bord(fac2,k)); // NAVIER_PAROI only: mean friction coefficient of fac1 and fac2 for component k
+      const int ind = DERIVED_T::IS_ANISO ? ori : k; // viscosity component index: ori when IS_ANISO, otherwise k
+      const double visc_lam = nu_lam_mean_2pts<DeviceSpace>(elem1,elem2,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts<DeviceSpace>(elem1,elem2,ind) : 0.0;
+      const double tau = (signe*(vit_imp-inco(fac3,k))/dist) - (signe*coeff*inco(fac3,k)); // signed gradient towards the imposed value minus the friction term (coeff stays 0 for PAROI)
+      flux = (tau*(visc_lam+visc_turb))*surf*poros; // no tau_tr term in this variant
     }
   else
-    {
-      double tau1 = tau_tan(rang1,ori)*0.5*surface(fac1), tau2 = tau_tan(rang2,ori)*0.5*surface(fac2);
-      for (int k = 0; k < ncomp; k++) flux[k] = tau1 + tau2;
-    }
+    flux = tau_tan(fac1,ori)*0.5*surface_v_(fac1) + tau_tan(fac2,ori)*0.5*surface_v_(fac2); // wall law: same value for every k; NOTE(review): tau_tan now receives fac1/fac2 where the vector version passed fac - premiere_face_bord - confirm tau_tan_impl expects this index
 }
 
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
-inline std::enable_if_t< (Arete_Type == Type_Flux_Arete::NAVIER), void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& flux) const
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // NAVIER edge: friction-only flux of component k, written to 'flux'
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::NAVIER, void>
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView, CDoubleTabView coeff_frottement_face_bord, CDoubleTabView, int fac1, int fac2, int fac3, int signe, int k, double& flux) const
 {
-  //         |
-  // fac 3   | fac 2
-  //  --------
-  //         | fac 1
-  //         |
-
-  // fac3 est la face interne et fac1 et fac2 sont au bord Navier
-  // XXX : WARNING : nu/nu_turb deja dans coeff
-  const int ncomp = flux.size_array();
-  const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2);
-  for (int k = 0; k < ncomp; k++)
-    {
-      const double coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur()));
-      const double tau = - signe * coeff * inco(fac3, k), tau_tr = 0.;
-      flux[k] = (tau + tau_tr) * surf * poros;
-    }
+  const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2); // arithmetic means over fac1 and fac2
+  const double coeff = 0.5*(coeff_frottement_face_bord(fac1,k)+coeff_frottement_face_bord(fac2,k)); // mean friction coefficient of fac1 and fac2 for component k
+  flux = (-signe*coeff*inco(fac3,k))*surf*poros; // no viscosity factor here: per the vector version's warning, nu/nu_turb is already included in coeff
 }
 
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
-inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& flux3, Type_Double& flux1_2) const
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // FLUIDE / NAVIER_FLUIDE / PAROI_FLUIDE edge: two fluxes of component k, written to flux3 and flux1_2
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView val_imp_face_bord, CDoubleTabView coeff_frottement_face_bord, CDoubleTabView, int fac1, int fac2, int fac3, int signe, int /*ncomp*/, int k, double& flux3, double& flux1_2) const
 {
-  assert (flux3.size_array() == flux1_2.size_array());
   constexpr bool is_NAV_FL = (Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE), is_PAR_FL = (Arete_Type == Type_Flux_Arete::PAROI_FLUIDE);
-  const int rang1 = (fac1-premiere_face_bord), rang2 = (fac2-premiere_face_bord), elem1 = elem_(fac3,0), elem2 = elem_(fac3,1), ori= orientation(fac3), ncomp = flux3.size_array();
-  const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2), surfporos = surface(fac3)*porosite(fac3), tps = inconnue->temps(),
-               dist1 = dist_norm_bord(fac1), dist2 = dist_face(fac1,fac2,ori);
-
+  const int elem1 = elem_v_(fac3,0), elem2 = elem_v_(fac3,1), ori = orientation_v_(fac3);
+  const double surf = mean_surface(fac1,fac2), poros = mean_porosity(fac1,fac2); // weights of flux3
+  const double surfporos = surface_v_(fac3)*porosite_v_(fac3); // weight of flux1_2
+  const double dist1 = le_dom_v_.dist_norm_bord(fac1), dist2 = le_dom_v_.dist_face(fac1,fac2,ori);
+  const int ind = DERIVED_T::IS_ANISO ? ori : k; // viscosity component index: ori when IS_ANISO, otherwise k
+  const double visc_lam = nu_lam_mean_2pts<DeviceSpace>(elem1,elem2,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts<DeviceSpace>(elem1,elem2,ind) : 0.0;
   double vit_imp, coeff = 0.0;
-
-  for (int k = 0; k < ncomp; k++)
-    {
-      const int ind = DERIVED_T::IS_ANISO ? ori : k;
-      const double visc_lam = nu_lam_mean_2pts(elem1,elem2,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts(elem1,elem2,ind) : 0.0;
-      if (is_NAV_FL)
-        {
-          vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord_sym(inco,tps,rang1,ori,la_zcl.valeur())+ Champ_Face_get_val_imp_face_bord_sym(inco,tps,rang2,ori,la_zcl.valeur()));
-          coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur()));
-        }
-      else if (is_PAR_FL) // On ne sait pas qui de fac1 ou de fac2 est la face de paroi
-        {
-          if (est_egal(inco(fac1,k),0)) vit_imp = Champ_Face_get_val_imp_face_bord(tps,rang2,ori,la_zcl.valeur()); // fac1 est la face de paroi
-          else vit_imp = Champ_Face_get_val_imp_face_bord(tps,rang1,ori,la_zcl.valeur()); // fac2 est la face de paroi
-        }
-      else vit_imp =  0.5*(Champ_Face_get_val_imp_face_bord(tps,rang1,ori,la_zcl.valeur())+Champ_Face_get_val_imp_face_bord(tps,rang2,ori,la_zcl.valeur()));
-
-      //         |
-      // fac 3   | fac 2
-      //  --------
-      //         | fac 1
-      //         |
-
-      // fac3 est la face interne et fac1 et fac2 sont au bord
-      const double tau_3 = (signe*(vit_imp-inco(fac3,k))/dist1) -(signe * coeff * inco(fac3, k)),
-                   tau_12 = (inco(fac2,k)-inco(fac1,k))/dist2, tau_tr_3 = ACTIVATE_TAU_TR ? tau_12 : 0.0, tau_tr_12 = ACTIVATE_TAU_TR ? tau_3 : 0.0;
-
-      flux3[k] = ((tau_3 + tau_tr_3) * (visc_lam + visc_turb)) * surf * poros;
-      flux1_2[k] = ((tau_12 + tau_tr_12) * (visc_lam + visc_turb)) * surfporos;
-    }
+  if (is_NAV_FL) // NAVIER_FLUIDE: mean imposed value plus mean friction coefficient
+    { vit_imp = 0.5*(val_imp_face_bord(fac1,ori)+val_imp_face_bord(fac2,ori)); coeff = 0.5*(coeff_frottement_face_bord(fac1,k)+coeff_frottement_face_bord(fac2,k)); } // NOTE(review): the vector version used the _sym getter here - confirm the caller fills val_imp_face_bord accordingly
+  else if (is_PAR_FL) // PAROI_FLUIDE: which of fac1/fac2 is the wall face is not known a priori
+    vit_imp = est_egal(inco(fac1,k),0,1e-12) ? val_imp_face_bord(fac2,ori) : val_imp_face_bord(fac1,ori); // inco(fac1,k) ~ 0 (tolerance 1e-12) is taken to mean fac1 is the wall face, so the value imposed on fac2 is used
+  else // FLUIDE: plain mean of the imposed values, coeff stays 0
+    vit_imp = 0.5*(val_imp_face_bord(fac1,ori)+val_imp_face_bord(fac2,ori));
+  const double tau_3 = (signe*(vit_imp-inco(fac3,k))/dist1)-(signe*coeff*inco(fac3,k)); // term for the inner face fac3 (fac1 and fac2 are the boundary faces)
+  const double tau_12 = (inco(fac2,k)-inco(fac1,k))/dist2; // term between the two boundary faces
+  const double tau_tr_3 = ACTIVATE_TAU_TR ? tau_12 : 0.0, tau_tr_12 = ACTIVATE_TAU_TR ? tau_3 : 0.0; // each flux receives the other term as cross contribution when ACTIVATE_TAU_TR is set
+  flux3   = ((tau_3  +tau_tr_3 )*(visc_lam+visc_turb))*surf*poros;
+  flux1_2 = ((tau_12 +tau_tr_12)*(visc_lam+visc_turb))*surfporos;
 }
 
-template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PERIODICITE, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete(const DoubleTab& inco, const DoubleTab*, int fac1, int fac2, int fac3, int fac4, Type_Double& flux3_4, Type_Double& flux1_2) const
+template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type> // PERIODICITE edge: two fluxes of component k, written to flux3_4 and flux1_2
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Arete_Type == Type_Flux_Arete::PERIODICITE, void>
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::flux_arete_comp(CDoubleTabView inco, CDoubleTabView, int fac1, int fac2, int fac3, int fac4, int k, double& flux3_4, double& flux1_2) const
 {
-  assert (flux3_4.size_array() == flux1_2.size_array());
-  const int ori1 = orientation(fac1), ori3 = orientation(fac3), elem1 = elem_(fac3,0), elem2 = elem_(fac3,1), elem3 = elem_(fac4,0), elem4 = elem_(fac4,1), ncomp = flux3_4.size_array();
-  const double dist3_4 = dist_face_period(fac3,fac4,ori1), dist1_2 = dist_face_period(fac1,fac2,ori3),
-               surf1_2 = surface_(fac1,fac2), poros1_2 = porosity_(fac1, fac2), surf3_4 = surface_(fac3,fac4), poros3_4 = porosity_(fac3, fac4);
-
-  for (int k = 0; k < ncomp; k++)
-    {
-      const int ind = DERIVED_T::IS_ANISO ? ori3 : k;
-      const double visc_lam = nu_lam_mean_4pts(elem1,elem2,elem3,elem4,ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_4pts(elem1,elem2,elem3,elem4,ind) : 0.0;
-      const double tau_34 = (inco(fac4,k)-inco(fac3,k))/dist3_4, tau_12 = (inco(fac2,k)-inco(fac1,k))/dist1_2, tau_tr_34 = ACTIVATE_TAU_TR ? tau_12 : 0.0, tau_tr_12 = ACTIVATE_TAU_TR ? tau_34 : 0.0;
-      flux3_4[k] = ((tau_34 + tau_tr_34) * (visc_lam + visc_turb)) * surf1_2 * poros1_2;
-      flux1_2[k] = ((tau_12 + tau_tr_12) * (visc_lam + visc_turb)) * surf3_4 * poros3_4;
-    }
+  const int ori1 = orientation_v_(fac1), ori3 = orientation_v_(fac3);
+  const int elem1 = elem_v_(fac3,0), elem2 = elem_v_(fac3,1), elem3 = elem_v_(fac4,0), elem4 = elem_v_(fac4,1); // the four element indices stored for fac3 and fac4
+  const double dist3_4 = le_dom_v_.dist_face_period(fac3,fac4,ori1), dist1_2 = le_dom_v_.dist_face_period(fac1,fac2,ori3); // dist_face_period variant is used for both face pairs
+  const double surf1_2 = mean_surface(fac1,fac2), poros1_2 = mean_porosity(fac1,fac2);
+  const double surf3_4 = mean_surface(fac3,fac4), poros3_4 = mean_porosity(fac3,fac4);
+  const int ind = DERIVED_T::IS_ANISO ? ori3 : k; // viscosity component index: ori3 when IS_ANISO, otherwise k
+  const double visc_lam = nu_lam_mean_4pts<DeviceSpace>(elem1,elem2,elem3,elem4,ind);
+  const double visc_turb = DERIVED_T::IS_TURB ? nu_mean_4pts<DeviceSpace>(elem1,elem2,elem3,elem4,ind) : 0.0; // 4-point mean viscosity is only added when DERIVED_T::IS_TURB
+  const double tau_34 = (inco(fac4,k)-inco(fac3,k))/dist3_4, tau_12 = (inco(fac2,k)-inco(fac1,k))/dist1_2;
+  const double tau_tr_34 = ACTIVATE_TAU_TR ? tau_12 : 0.0, tau_tr_12 = ACTIVATE_TAU_TR ? tau_34 : 0.0; // each flux receives the other term as cross contribution when ACTIVATE_TAU_TR is set
+  flux3_4 = ((tau_34+tau_tr_34)*(visc_lam+visc_turb))*surf1_2*poros1_2; // weighted by the fac1/fac2 means, as in the vector version
+  flux1_2 = ((tau_12+tau_tr_12)*(visc_lam+visc_turb))*surf3_4*poros3_4; // weighted by the fac3/fac4 means, as in the vector version
 }
 
 /* ************************************** *
@@ -223,7 +184,7 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_fa7(const DoubleTab*, int elem,int fac
       f1[k] = f2[k] = ((d_tau + d_tau_tr) * (visc_lam + visc_turb)) * surf;
     }
 
-  if (TEST_COEFFS) test_coeffs_fa7<Fa7_Type,Type_Double>(elem,fac1,fac2,f1);
+  //if (TEST_COEFFS) test_coeffs_fa7<Fa7_Type,Type_Double>(elem,fac1,fac2,f1);
 }
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE, void>
@@ -241,7 +202,7 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*, int fac1, int
       aii[k] = ajj[k] = ((d_tau + d_tau_tr) * (visc_lam + visc_turb)) * surf * poros;
     }
 
-  if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(fac1,fac2,fac3,fac4,aii);
+  //if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(fac1,fac2,fac3,fac4,aii);
 }
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double> inline std::enable_if_t< Arete_Type == Type_Flux_Arete::MIXTE, void>
@@ -269,7 +230,7 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*,int fac1, int f
   const double surf = surface_(fac1,fac2), poros = porosity_(fac1,fac2),
                d_tau = 1. / dist_face(fac3,fac4,ori1), d_tau_tr = 0.; // On derive par rapport a fac3 et fac4
 
-  const DoubleTab& inco = inconnue->valeurs();
+  const DoubleTab& inco = inconnue_->valeurs();
   for (int k = 0; k < ncomp; k++)
     if (inco(fac4,k) * inco(fac3,k) != 0)
       {
@@ -277,12 +238,12 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*,int fac1, int f
         aii[k] = ajj[k] = ((d_tau + d_tau_tr) * (visc_lam + visc_turb)) * surf * poros;
       }
 
-  if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(fac1,fac2,fac3,fac4,aii);
+  //if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(fac1,fac2,fac3,fac4,aii);
 }
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
 inline std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3_4, Type_Double& ajj1_2) const
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3_4, Type_Double& ajj1_2) const
 {
   assert(aii1_2.size_array() == aii3_4.size_array() && aii1_2.size_array() == ajj1_2.size_array());
   constexpr bool is_PAROI = (Arete_Type == Type_Flux_Arete::PAROI);
@@ -302,7 +263,7 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*, int fac1, int
       for (int k = 0; k < ncomp; k++)
         {
           if (!is_PAROI) // NAVIER_PAROI
-            coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur()));
+            coeff = 0.5 * (coeff_frottement_face_bord(fac1, k) + coeff_frottement_face_bord(fac2, k));
 
           const double d_tau = signe / dist - (signe * coeff), d_tau_tr = 0.; // On a pas derive ... deja nul dans le flux !
           const int ind = DERIVED_T::IS_ANISO ? ori : k;
@@ -313,12 +274,12 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*, int fac1, int
     }
   else for (int k = 0; k < ncomp; k++) aii3_4[k] = aii1_2[k] = ajj1_2[k] = 0.;
 
-  if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(fac1,fac2,fac3,signe,aii3_4);
+  //if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(val_imp_face_bord, coeff_frottement_face_bord, fac1,fac2,fac3,signe,aii3_4);
 }
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
 inline std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3, Type_Double& ajj1_2) const
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3, Type_Double& ajj1_2) const
 {
   assert(aii1_2.size_array() == aii3.size_array() && aii1_2.size_array() == ajj1_2.size_array());
   constexpr bool is_NAV_FL = (Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE);
@@ -330,7 +291,7 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*, int fac1, int
   for (int k = 0; k < ncomp; k++)
     {
       if (is_NAV_FL)
-        coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur()));
+        coeff = 0.5 * (coeff_frottement_face_bord(fac1, k) + coeff_frottement_face_bord(fac2, k));
 
       const double d_tau_3 = (signe / dist1) - (signe * coeff), d_tau_tr_3 = 0., // On derive par rapport a fac3
                    d_tau_12 = 1. / dist2, d_tau_tr_12 = 0.; // On derive par rapport a fac1 et fac2
@@ -341,12 +302,12 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*, int fac1, int
       aii1_2[k] = ajj1_2[k] = ((d_tau_12 + d_tau_tr_12) * (visc_lam + visc_turb)) * surfporos;
     }
 
-  if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(fac1,fac2,fac3,signe,aii1_2,aii3);
+  //if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(val_imp_face_bord, coeff_frottement_face_bord, fac1,fac2,fac3,signe,aii1_2,aii3);
 }
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
 inline std::enable_if_t< (Arete_Type == Type_Flux_Arete::NAVIER), void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3, Type_Double& ajj1_2) const
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const DoubleTab*, int fac1, int fac2, int fac3, int signe, Type_Double& aii1_2, Type_Double& aii3, Type_Double& ajj1_2) const
 {
   assert(aii1_2.size_array() == aii3.size_array() && aii1_2.size_array() == ajj1_2.size_array());
   const int elem1 = elem_(fac3,0), elem2 = elem_(fac3,1), ncomp = aii1_2.size_array(), ori = orientation(fac3);
@@ -356,7 +317,7 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*, int fac1, int
     {
       const int ind = DERIVED_T::IS_ANISO ? ori : k;
       const double visc_lam = nu_lam_mean_2pts(elem1, elem2, ind), visc_turb = DERIVED_T::IS_TURB ? nu_mean_2pts(elem1, elem2, ind) : 0.0;
-      const double coeff = 0.5 * (Champ_Face_coeff_frottement_face_bord(fac1, k, la_zcl.valeur()) + Champ_Face_coeff_frottement_face_bord(fac2, k, la_zcl.valeur()));
+      const double coeff = 0.5 * (coeff_frottement_face_bord(fac1, k) + coeff_frottement_face_bord(fac2, k));
       const double d_tau_3 = - (signe * coeff), d_tau_tr_3 = 0., // On derive par rapport a fac3
                    d_tau_12 = 1. / dist2, d_tau_tr_12 = 0.; // On derive par rapport a fac1 et fac2
 
@@ -380,13 +341,14 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::coeffs_arete(const DoubleTab*,int fac1, int f
       aii[k] = ajj[k] = ((d_tau + d_tau_tr) * (visc_lam + visc_turb)) * surf * poros;
     }
 
-  if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(fac1,fac2,fac3,fac4,aii);
+  //if (TEST_COEFFS) test_coeffs_arete<Arete_Type,Type_Double>(fac1,fac2,fac3,fac4,aii);
 }
 
 /* ************************************** *
  * *********   For checking   *********** *
  * ************************************** */
 
+/*
 template <typename DERIVED_T> template<typename Type_Double>
 void Eval_Diff_VDF_Face_Gen<DERIVED_T>::check_error(const char * nom_funct, const int Type_Flux, const int ncomp, const Type_Double& f1, const Type_Double& flux_p, const Type_Double& flux_m) const
 {
@@ -411,7 +373,7 @@ template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type
 inline std::enable_if_t< Arete_Type == Type_Flux_Arete::INTERNE || Arete_Type == Type_Flux_Arete::MIXTE, void>
 Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_common(const int fac1, const int fac2, const int fac3, const int fac4, Type_Double& flux_p, Type_Double& flux_m) const
 {
-  DoubleTab inco_pert = inconnue->valeurs();
+  DoubleTab inco_pert = inconnue_->valeurs();
 
   const int ncomp = flux_p.size_array();
   for (int k = 0; k < ncomp; k++)  inco_pert(fac4,k) += EPS; // XXX : ATTENTION SIGNE
@@ -423,49 +385,49 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_common(const int fac1, const int
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
 inline std::enable_if_t<Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_common(const int fac1, const int fac2, const int fac3, const int signe, Type_Double& flux_p, Type_Double& flux_m) const
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_common(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const int fac1, const int fac2, const int fac3, const int signe, Type_Double& flux_p, Type_Double& flux_m) const
 {
-  DoubleTab inco_pert = inconnue->valeurs();
+  DoubleTab inco_pert = inconnue_->valeurs();
 
   const int ncomp = flux_p.size_array();
   for (int k = 0; k < ncomp; k++)  inco_pert(fac3,k) -= EPS; // XXX : ATTENTION SIGNE
 
-  flux_arete<Arete_Type,Type_Double>(inco_pert,nullptr,fac1,fac2,fac3,signe,flux_p);
+  flux_arete<Arete_Type,Type_Double>(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,flux_p);
 
   for (int k = 0; k < ncomp; k++) inco_pert(fac3,k) += 2.0 * EPS; // XXX : ATTENTION SIGNE
-  flux_arete<Arete_Type,Type_Double>(inco_pert,nullptr,fac1,fac2,fac3,signe,flux_m);
+  flux_arete<Arete_Type,Type_Double>(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,flux_m);
 }
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
 inline std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_common(const int fac1, const int fac2, const int fac3, const int signe, Type_Double& flux_p3, Type_Double& flux_m3, Type_Double& flux_p12, Type_Double& flux_m12) const
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_common(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const int fac1, const int fac2, const int fac3, const int signe, Type_Double& flux_p3, Type_Double& flux_m3, Type_Double& flux_p12, Type_Double& flux_m12) const
 {
-  DoubleTab inco_pert = inconnue->valeurs();
+  DoubleTab inco_pert = inconnue_->valeurs();
   Type_Double poubelle(flux_p3.size_array());
 
   const int ncomp = flux_p3.size_array();
 
   // pour flux3
   for (int k = 0; k < ncomp; k++) inco_pert(fac3,k) -= EPS; // XXX : ATTENTION SIGNE
-  flux_arete<Arete_Type,Type_Double>(inco_pert,nullptr,fac1,fac2,fac3,signe,flux_p3,poubelle);
+  flux_arete<Arete_Type,Type_Double>(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,flux_p3,poubelle);
 
   for (int k = 0; k < ncomp; k++) inco_pert(fac3,k) += 2.0 * EPS; // XXX : ATTENTION SIGNE
-  flux_arete<Arete_Type,Type_Double>(inco_pert,nullptr,fac1,fac2,fac3,signe,flux_m3,poubelle);
+  flux_arete<Arete_Type,Type_Double>(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,flux_m3,poubelle);
 
   // pour flux1_2
-  inco_pert = inconnue->valeurs(); // back to real values
+  inco_pert = inconnue_->valeurs(); // back to real values
   for (int k = 0; k < ncomp; k++) inco_pert(fac2,k) += EPS; // XXX : ATTENTION SIGNE
-  flux_arete<Arete_Type,Type_Double>(inco_pert,nullptr,fac1,fac2,fac3,signe,poubelle,flux_p12);
+  flux_arete<Arete_Type,Type_Double>(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,poubelle,flux_p12);
 
   for (int k = 0; k < ncomp; k++)  inco_pert(fac2,k) -= 2.0 * EPS; // XXX : ATTENTION SIGNE
-  flux_arete<Arete_Type,Type_Double>(inco_pert,nullptr,fac1,fac2,fac3,signe,poubelle,flux_m12);
+  flux_arete<Arete_Type,Type_Double>(inco_pert,val_imp_face_bord, coeff_frottement_face_bord,nullptr,fac1,fac2,fac3,signe,poubelle,flux_m12);
 }
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
 inline std::enable_if_t<Arete_Type == Type_Flux_Arete::PERIODICITE, void>
 Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_common(const int fac1, const int fac2, const int fac3, const int fac4, Type_Double& flux_p, Type_Double& flux_m) const
 {
-  DoubleTab inco_pert = inconnue->valeurs();
+  DoubleTab inco_pert = inconnue_->valeurs();
   Type_Double poubelle(flux_p.size_array());
   const int ncomp = flux_p.size_array();
 
@@ -481,7 +443,7 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_fa7(const int elem, const int fac
 {
   const int ncomp = f1.size_array();
   Type_Double flux_p(ncomp), flux_m(ncomp);
-  DoubleTab inco_pert = inconnue->valeurs();
+  DoubleTab inco_pert = inconnue_->valeurs();
 
   for (int k = 0; k < ncomp; k++) inco_pert(fac2,k) += EPS; // XXX : ATTENTION SIGNE
   flux_fa7<Fa7_Type,Type_Double>(inco_pert,nullptr,elem,fac1,fac2,flux_p);
@@ -509,26 +471,26 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_arete(const int fac1, const int f
   const int ncomp = aii.size_array();
   Type_Double flux_p(ncomp), flux_m(ncomp);
   test_coeffs_common<Arete_Type,Type_Double>(fac1,fac2,fac3,fac4,flux_p,flux_m);
-  if (inconnue->valeurs()(fac4,0) * inconnue->valeurs()(fac3,0) != 0) check_error<Type_Double>(__func__,(int)Arete_Type,ncomp,aii,flux_p,flux_m);
+  if (inconnue_->valeurs()(fac4,0) * inconnue_->valeurs()(fac3,0) != 0) check_error<Type_Double>(__func__,(int)Arete_Type,ncomp,aii,flux_p,flux_m);
 }
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
 inline std::enable_if_t<Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_arete(const int fac1, const int fac2, const int fac3, const int signe, const Type_Double& aii3_4) const
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const int fac1, const int fac2, const int fac3, const int signe, const Type_Double& aii3_4) const
 {
   const int ncomp = aii3_4.size_array();
   Type_Double flux_p(ncomp), flux_m(ncomp);
-  test_coeffs_common<Arete_Type,Type_Double>(fac1,fac2,fac3,signe,flux_p,flux_m);
+  test_coeffs_common<Arete_Type,Type_Double>(val_imp_face_bord, coeff_frottement_face_bord,fac1,fac2,fac3,signe,flux_p,flux_m);
   if ( !uses_wall_law() ) check_error<Type_Double>(__func__,(int)Arete_Type,ncomp,aii3_4,flux_p,flux_m);
 }
 
 template <typename DERIVED_T> template<Type_Flux_Arete Arete_Type, typename Type_Double>
 inline std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE, void>
-Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_arete(const int fac1, const int fac2, const int fac3, const int signe, const Type_Double& aii1_2, const Type_Double& aii3) const
+Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_arete(const DoubleTab& val_imp_face_bord, const DoubleTab& coeff_frottement_face_bord, const int fac1, const int fac2, const int fac3, const int signe, const Type_Double& aii1_2, const Type_Double& aii3) const
 {
   const int ncomp = aii1_2.size_array();
   Type_Double flux_p3(ncomp), flux_m3(ncomp), flux_p12(ncomp), flux_m12(ncomp);
-  test_coeffs_common<Arete_Type,Type_Double>(fac1,fac2,fac3,signe,flux_p3,flux_m3,flux_p12,flux_m12);
+  test_coeffs_common<Arete_Type,Type_Double>(val_imp_face_bord, coeff_frottement_face_bord,fac1,fac2,fac3,signe,flux_p3,flux_m3,flux_p12,flux_m12);
   check_error<Type_Double>(__func__,(int)Arete_Type,ncomp,aii3,flux_p3,flux_m3);
   check_error<Type_Double>(__func__,(int)Arete_Type,ncomp,aii1_2,flux_p12,flux_m12);
 }
@@ -542,5 +504,5 @@ Eval_Diff_VDF_Face_Gen<DERIVED_T>::test_coeffs_arete(const int fac1, const int f
   test_coeffs_common<Arete_Type,Type_Double>(fac1,fac2,fac3,fac4,flux_p,flux_m);
   check_error<Type_Double>(__func__,(int)Arete_Type,ncomp,aii,flux_p,flux_m);
 }
-
+*/
 #endif /* Eval_Diff_VDF_Face_Gen_TPP_included */
diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_Multiphase_VDF.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_Multiphase_VDF.h
index 0adb2a09b7..a1a39cefbe 100644
--- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_Multiphase_VDF.h
+++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_Multiphase_VDF.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -37,16 +37,16 @@ class Eval_Dift_Multiphase_VDF : public Eval_Dift_VDF
   {
     const DoubleTab& rho = ref_probleme_->milieu().masse_volumique().passe();
     const int cR = rho.dimension(0) == 1;
-    tab_diffusivite_turbulente = nu_t_;
+    tab_diffusivite_turbulente_ = nu_t_;
 
     if (need_alpha_rho_ && sub_type(Pb_Multiphase, ref_probleme_.valeur()))
       {
         for (int e = 0; e < nu_t_->dimension(0); e++)
           for (int n = 0; n < nu_t_->dimension(1); n++)
-            tab_diffusivite_turbulente(e, n) = tab_alpha_(e, n) * rho(!cR * e, n) * nu_t_.valeur()(e, n);
+            tab_diffusivite_turbulente_(e, n) = tab_alpha_(e, n) * rho(!cR * e, n) * nu_t_.valeur()(e, n);
       }
 
-    tab_diffusivite_turbulente.echange_espace_virtuel();
+    tab_diffusivite_turbulente_.echange_espace_virtuel();
     tab_diff_turb_first_update_ = false;
   }
 
@@ -55,7 +55,7 @@ class Eval_Dift_Multiphase_VDF : public Eval_Dift_VDF
     if (tab_diff_turb_first_update_)
       const_cast<Eval_Dift_Multiphase_VDF&>(*this).update_diffusivite_turbulente();
 
-    return tab_diffusivite_turbulente;
+    return tab_diffusivite_turbulente_;
   }
 
   const Champ_Fonc_base& diffusivite_turbulente() const { throw; }
diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF.h
index 1fbf26e1c8..471ddcb672 100644
--- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF.h
+++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF.h
@@ -24,6 +24,18 @@
 class Eval_Dift_VDF : public Eval_Diff_VDF
 {
 public:
+  inline Eval_Dift_VDF() { } // Default constructor: nothing to do, members keep their default / in-class initial state
+  inline Eval_Dift_VDF(const Eval_Dift_VDF& eval) : Eval_Diff_VDF(eval) // Copy constructor: the base part is copied by Eval_Diff_VDF, the members below by hand
+  {
+    is_multi_ = eval.is_multi_;
+    loipar = eval.loipar;
+    ref_diffusivite_turbulente_ = eval.ref_diffusivite_turbulente_;
+    //equivalent_distance.ref(eval.equivalent_distance); // NOTE(review): equivalent_distance is NOT copied, it stays default-constructed in the copy - confirm nothing indexes it through a copy
+    tab_diffusivite_turbulente_.ref(eval.tab_diffusivite_turbulente_); // ref() instead of assignment: presumably aliases the source array rather than duplicating it - TODO confirm
+    tab_diffusivite_turbulente_v_ = eval.tab_diffusivite_turbulente_v_; // the read-only view is copied as is
+  }
+  virtual ~Eval_Dift_VDF() { }
+
   inline void mettre_a_jour() override
   {
     Eval_Diff_VDF::mettre_a_jour();
@@ -35,46 +47,60 @@ class Eval_Dift_VDF : public Eval_Diff_VDF
     return equivalent_distance[boundary_index](local_face);
   }
 
-  inline double compute_heq_impl(double d0, int i, double d1, int j, int compo) const
+  template<typename ExecSpace=HostSpace> // Turbulent diffusivity accessor: HostSpace (default) reads the DoubleTab, any other ExecSpace reads the read-only view
+  KOKKOS_INLINE_FUNCTION double tab_diffusivite_turbulente(int face, int comp) const { if constexpr (!std::is_same_v<ExecSpace, HostSpace>) return tab_diffusivite_turbulente_v_(face, comp); else return tab_diffusivite_turbulente_(face, comp); }
+
+  template<typename ExecSpace=HostSpace> // Equivalent exchange coefficient: laminar part from Eval_Diff_VDF plus a turbulent part; ExecSpace selects host table or view
+  KOKKOS_INLINE_FUNCTION double compute_heq_impl(double d0, int i, double d1, int j, int compo) const
   {
-    const double heq_lam = Eval_Diff_VDF::compute_heq_impl(d0, i, d1, j, compo);
-    const double heq_turb = 0.5 * (tab_diffusivite_turbulente(i, is_multi_ * compo) + tab_diffusivite_turbulente(j, is_multi_ * compo)) / (d1 + d0);
+    const double heq_lam = Eval_Diff_VDF::compute_heq_impl<ExecSpace>(d0, i, d1, j, compo);
+    const double heq_turb = 0.5 * (tab_diffusivite_turbulente<ExecSpace>(i, is_multi_ * compo) + tab_diffusivite_turbulente<ExecSpace>(j, is_multi_ * compo)) / (d1 + d0); // mean turbulent diffusivity of i and j over the total distance d0+d1
     return heq_lam + heq_turb;
   }
 
-  inline double nu_t_impl(int i, int compo) const { return tab_diffusivite_turbulente(i, is_multi_ * compo); }
+  template<typename ExecSpace=HostSpace> // Turbulent diffusivity at index i for component compo
+  KOKKOS_INLINE_FUNCTION double nu_t_impl(int i, int compo) const
+  {
+    return tab_diffusivite_turbulente<ExecSpace>(i, is_multi_ * compo); // is_multi_ * compo: the column index collapses to 0 when is_multi_ is 0
+  }
 
-  inline double nu_lam_impl_face(int i, int j, int k, int l, int compo) const
+  template<typename ExecSpace=HostSpace> // Laminar value over four indices: forwards to the base-class (Eval_Diff_VDF) nu_2_impl_face
+  KOKKOS_INLINE_FUNCTION double nu_lam_impl_face(int i, int j, int k, int l, int compo) const
   {
-    return Eval_Diff_VDF::nu_2_impl_face(i, j, k, l, compo);
+    return Eval_Diff_VDF::nu_2_impl_face<ExecSpace>(i, j, k, l, compo); // explicitly qualified so the Eval_Diff_VDF version is the one called
   }
 
-  inline double nu_lam_impl_face2(int i, int j, int compo) const
+  template<typename ExecSpace=HostSpace> // Laminar value over two indices: forwards to the base-class (Eval_Diff_VDF) nu_1_impl_face
+  KOKKOS_INLINE_FUNCTION double nu_lam_impl_face2(int i, int j, int compo) const
   {
-    return Eval_Diff_VDF::nu_1_impl_face(i, j, compo);
+    return Eval_Diff_VDF::nu_1_impl_face<ExecSpace>(i, j, compo); // explicitly qualified so the Eval_Diff_VDF version is the one called
   }
 
-  inline double nu_1_impl(int i, int compo) const
+  template<typename ExecSpace=HostSpace> // Total diffusivity at index i: base-class (laminar) nu_1_impl plus the turbulent diffusivity
+  KOKKOS_INLINE_FUNCTION double nu_1_impl(int i, int compo) const
   {
-    const double nu_lam = Eval_Diff_VDF::nu_1_impl(i, compo);
-    const double nu_turb = tab_diffusivite_turbulente(i, is_multi_ * compo);
+    const double nu_lam = Eval_Diff_VDF::nu_1_impl<ExecSpace>(i, compo);
+    const double nu_turb = tab_diffusivite_turbulente<ExecSpace>(i, is_multi_ * compo); // column index collapses to 0 when is_multi_ is 0
     return nu_lam + nu_turb;
   }
 
-  inline double nu_2_impl(int i, int compo) const
+  template<typename ExecSpace=HostSpace> // Pure forward to the base-class (Eval_Diff_VDF) nu_2_impl: no turbulent contribution is added here
+  KOKKOS_INLINE_FUNCTION double nu_2_impl(int i, int compo) const
   {
-    return Eval_Diff_VDF::nu_2_impl(i, compo);
+    return Eval_Diff_VDF::nu_2_impl<ExecSpace>(i, compo);
   }
 
-  inline double nu_1_impl_face(int i, int j, int compo) const
+  template<typename ExecSpace=HostSpace> // Turbulent diffusivity only: arithmetic mean of the values at indices i and j
+  KOKKOS_INLINE_FUNCTION double nu_1_impl_face(int i, int j, int compo) const
   {
-    return 0.5 * (tab_diffusivite_turbulente(i, is_multi_ * compo) + tab_diffusivite_turbulente(j, is_multi_ * compo));
+    return 0.5 * (tab_diffusivite_turbulente<ExecSpace>(i, is_multi_ * compo) + tab_diffusivite_turbulente<ExecSpace>(j, is_multi_ * compo)); // 2-point average
   }
 
-  inline double nu_2_impl_face(int i, int j, int k, int l, int compo) const
+  template<typename ExecSpace=HostSpace> // Turbulent diffusivity only: arithmetic mean of the values at indices i, j, k and l
+  KOKKOS_INLINE_FUNCTION double nu_2_impl_face(int i, int j, int k, int l, int compo) const
   {
-    return 0.25 * (tab_diffusivite_turbulente(i, is_multi_ * compo) + tab_diffusivite_turbulente(j, is_multi_ * compo) +
-                   tab_diffusivite_turbulente(k, is_multi_ * compo) + tab_diffusivite_turbulente(l, is_multi_ * compo));
+    return 0.25 * (tab_diffusivite_turbulente<ExecSpace>(i, is_multi_ * compo) + tab_diffusivite_turbulente<ExecSpace>(j, is_multi_ * compo) +
+                   tab_diffusivite_turbulente<ExecSpace>(k, is_multi_ * compo) + tab_diffusivite_turbulente<ExecSpace>(l, is_multi_ * compo)); // 4-point average
   }
 
   void update_equivalent_distance()
@@ -92,19 +118,26 @@ class Eval_Dift_VDF : public Eval_Diff_VDF
   inline void associer_diff_turb(const Champ_Fonc_base& diff_turb)
   {
     ref_diffusivite_turbulente_ = diff_turb;
-    tab_diffusivite_turbulente.ref(diff_turb.valeurs());
+    tab_diffusivite_turbulente_.ref(diff_turb.valeurs()); // bind the local table to the field values via ref() (presumably an alias, not a deep copy - TODO confirm)
     is_multi_ = (diff_turb.valeurs().dimension(1) > 1) ? 1 : 0;
   }
 
   inline virtual void associer_loipar(const Turbulence_paroi_scal_base& loi_paroi) { loipar = loi_paroi; }
   inline virtual void init_ind_fluctu_term() { /* do nothing */}
 
+  void view_ro_impl() const override // Refreshes the read-only views: base-class ones first, then the turbulent diffusivity view
+  {
+    Eval_Diff_VDF::view_ro_impl();
+    tab_diffusivite_turbulente_v_ = tab_diffusivite_turbulente_.view_ro(); // allowed in a const method because the view member is declared mutable
+  }
+
 protected:
   int is_multi_ = 0;
   OBS_PTR(Champ_Fonc_base) ref_diffusivite_turbulente_;
   OBS_PTR(Turbulence_paroi_scal_base) loipar;
   DoubleVects equivalent_distance;
-  DoubleTab tab_diffusivite_turbulente;
+  DoubleTab tab_diffusivite_turbulente_;
+  mutable CDoubleTabView tab_diffusivite_turbulente_v_;
 };
 
 #endif /* Eval_Dift_VDF_included */
diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.cpp b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.cpp
index 71c23b1f91..9bb72ed8d9 100644
--- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.cpp
+++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.cpp
@@ -37,19 +37,3 @@ void Eval_Dift_VDF_Face::mettre_a_jour()
       if (tab.size_array() > 0) tau_tan_.ref(tab);
     }
 }
-
-double Eval_Dift_VDF_Face::tau_tan_impl(int face, int k) const
-{
-  const int nb_faces = le_dom->nb_faces();
-  const ArrOfInt& ind_faces_virt_bord = le_dom->ind_faces_virt_bord();
-  int f = (face >= tau_tan_.dimension(0)) ? ind_faces_virt_bord[face-nb_faces] : face;
-  if(f >= tau_tan_.dimension_tot(0))
-    {
-      Cerr << "Erreur dans tau_tan " << finl;
-      Cerr << "dimension : " << tau_tan_.dimension(0) << finl;
-      Cerr << "dimension_tot : " << tau_tan_.dimension_tot(0) << finl;
-      Cerr << "face : " << face << finl;
-      Process::exit();
-    }
-  return tau_tan_(f,k);
-}
diff --git a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.h b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.h
index 009925b28e..d064dd5699 100644
--- a/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.h
+++ b/src/VDF/Operateurs/Eval_Diff_Dift/Eval_Dift_VDF_leaves.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -92,23 +92,45 @@ class Eval_Dift_VDF_Multi_inco_Elem : public Eval_Diff_VDF_Elem_Gen<Eval_Dift_VD
 class Eval_Dift_VDF_Face : public Eval_Diff_VDF_Face_Gen<Eval_Dift_VDF_Face>, public Eval_Dift_VDF
 {
 public:
+  inline Eval_Dift_VDF_Face() { } // Default constructor: nothing to initialise explicitly
+  inline Eval_Dift_VDF_Face(const Eval_Dift_VDF_Face& eval) : Eval_Diff_VDF_Face_Gen<Eval_Dift_VDF_Face>(eval), Eval_Dift_VDF(eval) // Copy constructor: both bases copied first, then this class's own members
+  {
+    le_modele_turbulence = eval.le_modele_turbulence;
+    loipar = eval.loipar; // this class's private loipar (Turbulence_paroi_base), distinct from the member of the same name in Eval_Dift_VDF
+    tau_tan_.ref(eval.tau_tan_); // ref() rather than assignment - presumably aliases the source array, TODO confirm
+    tau_tan_v_ = eval.tau_tan_v_;
+  }
+  virtual ~Eval_Dift_VDF_Face() { }
   static constexpr bool IS_TURB = true, CALC_FA7_SORTIE_LIB = true, CALC_ARR_PAR_FL = false;
   inline void associer_modele_turbulence(const Modele_turbulence_hyd_base& mod) { le_modele_turbulence = mod;  }
-  inline bool uses_wall() const { return le_modele_turbulence->utiliser_loi_paroi(); }
+  KOKKOS_INLINE_FUNCTION bool uses_wall() const { return tau_tan_v_.data() != nullptr; } // true once view_ro() has bound tau_tan_v_ to actual data
   void mettre_a_jour() override;
-  double tau_tan_impl(int face,int k) const;
+  void view_ro() const override // Refreshes the read-only views; tau_tan_v_ is only (re)bound when the turbulence model uses a wall law
+  {
+    Eval_Diff_VDF_Face_Gen<Eval_Dift_VDF_Face>::view_ro();
+    if (le_modele_turbulence->utiliser_loi_paroi())
+      tau_tan_v_ = tau_tan_.view_ro();
+  }
+  KOKKOS_INLINE_FUNCTION double tau_tan_impl(int face,int k) const // Component k of tau_tan for a face, read through the view
+  {
+    const int nb_rows = static_cast<int>(tau_tan_v_.extent(0)); // first extent of the tau_tan view
+    const int row = (face < nb_rows) ? face : le_dom_v_.ind_faces_virt_bord(face-le_dom_v_.nb_faces()); // faces past the view are remapped through ind_faces_virt_bord
+    assert(row<nb_rows);
+    return tau_tan_v_(row,k);
+  }
 
 private:
   OBS_PTR(Modele_turbulence_hyd_base) le_modele_turbulence;
   OBS_PTR(Turbulence_paroi_base) loipar;
   DoubleTab tau_tan_;
+  mutable CDoubleTabView tau_tan_v_; // mutable so the const view_ro() can refresh it
 };
 
 class Eval_Dift_Multiphase_VDF_Face : public Eval_Diff_VDF_Face_Gen<Eval_Dift_Multiphase_VDF_Face>, public Eval_Dift_Multiphase_VDF
 {
 public:
   static constexpr bool IS_TURB = true, CALC_FA7_SORTIE_LIB = true, CALC_ARR_PAR_FL = false;
-  inline bool uses_wall() const { return false; }
+  KOKKOS_INLINE_FUNCTION bool uses_wall() const { return false; } // this evaluator never reports a wall law: always false
 };
 
 #endif /* Eval_Dift_VDF_leaves_included */
diff --git a/src/VDF/Operateurs/Eval_Divers/CL_Types_include.h b/src/VDF/Operateurs/Eval_Divers/CL_Types_include.h
index 9594003e5d..b184a5ca43 100644
--- a/src/VDF/Operateurs/Eval_Divers/CL_Types_include.h
+++ b/src/VDF/Operateurs/Eval_Divers/CL_Types_include.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -29,5 +29,13 @@
 #include <Neumann_paroi.h>
 #include <Periodique.h>
 #include <Symetrie.h>
+#include <kokkos++.h>
+
+/*! @brief BC_View: bundle of read-only array views used to pass boundary-condition values (val_imp, t_ext, h_imp, ...) to the evaluators
+ */
+struct BC_View
+{
+  CDoubleTabView val[3]; // three view slots; which quantity goes in which slot is not fixed here - NOTE(review): document the slot convention where the struct is filled
+};
 
 #endif /* CL_Types_include_included */
diff --git a/src/VDF/Operateurs/Eval_Divers/Eval_Div_VDF_Elem.h b/src/VDF/Operateurs/Eval_Divers/Eval_Div_VDF_Elem.h
index e7f55e8a79..df0ad6122d 100644
--- a/src/VDF/Operateurs/Eval_Divers/Eval_Div_VDF_Elem.h
+++ b/src/VDF/Operateurs/Eval_Divers/Eval_Div_VDF_Elem.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -33,25 +33,26 @@ class Eval_Div_VDF_Elem : public Eval_Div_VDF, public Eval_VDF_Elem
    * *********  POUR L'EXPLICITE ********** *
    * ************************************** */
 
-  template <typename BC, typename Type_Double> // Generic return
-  inline void flux_face(const DoubleTab& inco, const DoubleTab&, const int face, const BC&, const int , Type_Double& flux) const
-  { for (int k=0; k<flux.size_array(); k++) flux[k] = inco(face,k)*surface(face)*porosite(face); }
-
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int , const Symetrie&, const int, Type_Double& ) const { /* Do nothing */ }
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int , const Dirichlet_paroi_fixe&, const int, Type_Double& ) const { /* Do nothing */ }
-  template <typename Type_Double> inline void flux_face(const DoubleTab&, const DoubleTab&, const int , const Dirichlet_paroi_defilante&, const int, Type_Double& ) const { /* Do nothing */ }
-
   template <typename Type_Double>
-  inline void flux_face(const DoubleTab& inco, const int boundary_index, const int face, const int local_face, const Echange_externe_impose&, const int, Type_Double& flux) const
+  inline void flux_face(const DoubleTab& inco, const DoubleTab&, const int face, const Echange_global_impose&, const int , Type_Double& flux) const // Echange_global_impose overload: flux[k] = inco(face,k) * surface * porosity, unnamed arguments are ignored
   { for (int k=0; k<flux.size_array(); k++) flux[k] = inco(face,k)*surface(face)*porosite(face); }
-
-  template <typename Type_Double>
-  inline void flux_faces_interne(const DoubleTab& inco, const int face, Type_Double& flux) const
+  template <typename Type_Double> inline void flux_face(const DoubleTab& inco, const int boundary_index, const int face, const int local_face, const Echange_externe_impose&, const int, Type_Double& flux) const // Echange_externe_impose overload: same flux formula, boundary_index and local_face are not used
   { for (int k=0; k<flux.size_array(); k++) flux[k] = inco(face,k)*surface(face)*porosite(face); }
 
+  KOKKOS_INLINE_FUNCTION void flux_faces_interne_comp(CDoubleTabView inco, const int face, const int k, double& flux) const // Internal face, single component k: result written to flux
+  { flux = inco(face,k)*surface_v_(face)*porosite_v_(face); } // same formula as the array versions, read through the surface_v_ / porosite_v_ views
+  template <typename BC> // Boundary face, single component k; the boundary-condition type BC is resolved at compile time
+  KOKKOS_INLINE_FUNCTION void flux_faces_bord_comp(CDoubleTabView inco, CDoubleTabView, const int face, const BC_View&, const int, const int k, double& flux) const
+  {
+    if constexpr (std::is_same_v<BC, Symetrie> || std::is_same_v<BC, Dirichlet_paroi_fixe> || std::is_same_v<BC, Dirichlet_paroi_defilante>)
+      flux = 0.; // symmetry and wall (fixed / moving) conditions: flux is explicitly set to zero
+    else
+      flux = inco(face,k)*surface_v_(face)*porosite_v_(face); // every other BC type: same formula as for internal faces
+  }
+
   /* ************************************** *
-   * *********  POUR L'IMPLICITE ********** *
-   * ************************************** */
+  * *********  POUR L'IMPLICITE ********** *
+  * ************************************** */
 
   template <typename BC, typename Type_Double>
   inline void coeffs_face(const int, const int, const BC&, Type_Double& , Type_Double&  ) const { /* Do nothing */ }
@@ -63,6 +64,7 @@ class Eval_Div_VDF_Elem : public Eval_Div_VDF, public Eval_VDF_Elem
   template <typename BC, typename Type_Double> inline void secmem_face(const int, const BC&, const int, Type_Double& ) const { throw; }
   template <typename Type_Double> inline void secmem_face(const int, const int, const int, const Echange_externe_impose&, const int, Type_Double& ) const { throw; }
   template <typename Type_Double> inline void secmem_faces_interne(const int, Type_Double& ) const { throw; }
+
 };
 
 #endif /* Eval_Div_VDF_Elem_included */
diff --git a/src/VDF/Operateurs/Eval_Divers/Eval_VDF_Face.h b/src/VDF/Operateurs/Eval_Divers/Eval_VDF_Face.h
index 98b9251293..a45c8599e0 100644
--- a/src/VDF/Operateurs/Eval_Divers/Eval_VDF_Face.h
+++ b/src/VDF/Operateurs/Eval_Divers/Eval_VDF_Face.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -35,15 +35,15 @@ class Eval_VDF_Face
                         CALC_ARR_INT = true, CALC_ARR_MIXTE = true, CALC_ARR_PERIO = true, CALC_ARR_PAR_FL = true, CALC_ARR_PAR = true,
                         CALC_ARR_NAVIER_PAR = true, CALC_ARR_NAVIER_FL = true, CALC_ARR_NAVIER = true;
   inline void associer_inconnue(const Champ_base& );
-
+  inline OBS_PTR(Champ_base) inconnue() const { return inconnue_; }
 protected:
-  OBS_PTR(Champ_base) inconnue;
+  OBS_PTR(Champ_base) inconnue_;
 };
 
 inline void Eval_VDF_Face::associer_inconnue(const Champ_base& inco)
 {
   assert(sub_type(Champ_Face_VDF,inco));
-  inconnue=ref_cast(Champ_Face_VDF,inco);
+  inconnue_=ref_cast(Champ_Face_VDF,inco); // store the unknown field in the inconnue_ OBS_PTR; its Champ_Face_VDF type is asserted just above
 }
 
 #endif /* Eval_VDF_Face_included */
diff --git a/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.cpp b/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.cpp
index f5145b128c..361c3ecb2b 100644
--- a/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.cpp
+++ b/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -14,41 +14,43 @@
 *****************************************************************************/
 
 #include <Evaluateur_VDF.h>
-#include <Equation_base.h>
 #include <Milieu_base.h>
-#include <Domaine_Cl_VDF.h>
 
-Evaluateur_VDF::Evaluateur_VDF(const Evaluateur_VDF& eval) : le_dom(eval.le_dom), la_zcl(eval.la_zcl), dimension(eval.dimension),
-  premiere_face_bord(eval.premiere_face_bord)
+Evaluateur_VDF::Evaluateur_VDF(const Evaluateur_VDF& eval) // Copy constructor: copies the domain pointers and the *_v_ view members, and attaches porosite to eval.porosite through ref()
 {
-  surface.ref(eval.surface);
-  orientation.ref(eval.orientation);
-  elem_.ref(eval.elem_);
+  le_dom = eval.le_dom; // Domaine_VDF pointer
+  la_zcl = eval.la_zcl; // Domaine_Cl_VDF pointer
   porosite.ref(eval.porosite);
-  volume_entrelaces.ref(eval.volume_entrelaces);
-  xv.ref(eval.xv);
+  le_dom_v_ = eval.le_dom_v_; // from here on: view members copied in whatever state they have in 'eval'; view_ro() is not called by this constructor
+  elem_v_ = eval.elem_v_;
+  surface_v_ = eval.surface_v_;
+  orientation_v_ = eval.orientation_v_;
+  porosite_v_ = eval.porosite_v_;
+  volume_entrelaces_v_ = eval.volume_entrelaces_v_;
+  xv_v_ = eval.xv_v_;
+}
+
+void Evaluateur_VDF::view_ro() const // Fills the *_v_ members from le_dom and porosite; Iterateur_VDF_Elem calls it on a local evaluator copy right before each Kokkos kernel. const because the views are declared mutable
+{
+  le_dom_v_.set(le_dom);
+  elem_v_ = le_dom->face_voisins().view_ro(); // the 2 neighbouring elements of each face
+  surface_v_ = le_dom->face_surfaces().view_ro(); // face surfaces
+  orientation_v_ = le_dom->orientation().view_ro(); // face orientations
+  porosite_v_ = porosite.view_ro(); // face porosities (array attached by associer_porosite)
+  volume_entrelaces_v_ = le_dom->volumes_entrelaces().view_ro();
+  xv_v_ = le_dom->xv().view_ro(); // coordinates of the face centres
 }
 
 void Evaluateur_VDF::associer_domaines(const Domaine_VDF& domaine_vdf, const Domaine_Cl_VDF& domaine_cl_vdf)
 {
   le_dom = domaine_vdf;
   la_zcl = domaine_cl_vdf;
-  dimension = Objet_U::dimension;
-  premiere_face_bord = domaine_vdf.premiere_face_bord();
-  surface.ref(domaine_vdf.face_surfaces());
-  orientation.ref(domaine_vdf.orientation());
-  elem_.ref(domaine_vdf.face_voisins());
-  porosite.ref(la_zcl->equation().milieu().porosite_face());
-  volume_entrelaces.ref(domaine_vdf.volumes_entrelaces());
-  xv.ref(domaine_vdf.xv());
+  associer_porosite(la_zcl->equation().milieu().porosite_face()); // default porosity: face porosity of the medium of the equation behind la_zcl; goes through the virtual associer_porosite()
 }
 
+// Attaches 'poro' as the evaluator's porosity array (through porosite.ref). May be called by F5:
 void Evaluateur_VDF::associer_porosite(const DoubleVect& poro)
 {
   porosite.ref(poro);
 }
 
-double Evaluateur_VDF::dist_norm_bord(int face) const
-{
-  return le_dom->dist_norm_bord(face);
-}
diff --git a/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.h b/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.h
index e1d2751bac..7c4d618785 100644
--- a/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.h
+++ b/src/VDF/Operateurs/Eval_Divers/Evaluateur_VDF.h
@@ -19,8 +19,9 @@
 #include <Domaine_VDF.h>
 #include <TRUST_Ref.h>
 #include <TRUSTTab.h>
-
-class Domaine_Cl_VDF;
+#include <Domaine_Cl_VDF.h>
+#include <Milieu_base.h>
+#include <Equation_base.h>
 
 /*! @brief class Evaluateur_VDF Classe de base des evaluateurs VDF.
  *
@@ -38,25 +39,32 @@ class Evaluateur_VDF
   Evaluateur_VDF(const Evaluateur_VDF& );
   virtual void associer_domaines(const Domaine_VDF& , const Domaine_Cl_VDF& );
   virtual void associer_porosite(const DoubleVect&);
-
-  inline double dist_face_period(int fac1, int fac2, int k) const { return le_dom->dist_face_period(fac1,fac2,k); }
-  inline double dist_face(int fac1, int fac2, int k) const
-  {
-    return xv(fac2,k) - xv(fac1,k);
-    //return le_dom->dist_face(fac1, fac2, k);
-  }
+  virtual void view_ro() const;
+  inline const Domaine_Cl_VDF& get_la_zcl() const { return la_zcl.valeur(); }
 
 protected:
   OBS_PTR(Domaine_VDF) le_dom;
   OBS_PTR(Domaine_Cl_VDF) la_zcl;
-  int dimension = -100, premiere_face_bord = -100;
-  IntTab elem_;                       // les 2 elements voisins d'une face
-  DoubleVect surface;          // surfaces des faces
-  IntVect orientation;         // orientations des faces
   DoubleVect porosite;               // porosites surfaciques
-  DoubleVect volume_entrelaces;//
-  DoubleTab xv;                // coord des centres des faces
-  double dist_norm_bord(int) const;
+
+  // Views used by the Kokkos parts:
+  Domaine_VDF_View le_dom_v_;                 // Struct giving access to the Domaine_VDF data on the device
+  mutable CIntTabView elem_v_;                // the 2 neighbouring elements of a face
+  mutable CDoubleArrView surface_v_;          // face surfaces
+  mutable CIntArrView orientation_v_;         // face orientations
+  mutable CDoubleArrView porosite_v_;          // face (surface) porosities
+  mutable CDoubleArrView volume_entrelaces_v_; // staggered volumes (volumes entrelaces)
+  mutable CDoubleTabView xv_v_;                // coordinates of the face centres
+
+  // Functions called by the parts not yet ported; they reuse the names of the removed array members so that call sites such as surface(face) keep the same syntax:
+  inline int elem_(int face, int k) const { return le_dom->face_voisins(face, k); } // k-th neighbouring element of 'face'
+  inline int orientation(int face) const { return le_dom->orientation(face); }
+  inline double xv(int face, int k) const { return le_dom->xv(face, k); } // k-th coordinate of the centre of 'face'
+  inline double dist_norm_bord(int face) const { return le_dom->dist_norm_bord(face); }
+  inline double surface(int face) const { return le_dom->face_surfaces()(face); }
+  inline double volume_entrelaces(int face) const { return le_dom->volumes_entrelaces()(face); }
+  inline double dist_face_period(int fac1, int fac2, int k) const { return le_dom->dist_face_period(fac1,fac2,k); }
+  inline double dist_face(int fac1, int fac2, int k) const { return xv(fac2,k) - xv(fac1,k); } // signed difference of the k-th face-centre coordinate, fac2 minus fac1
 };
 
 #endif /* Evaluateur_VDF_included */
diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.h b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.h
index c7ac3bff73..22cf52f8ac 100644
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.h
+++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -33,13 +33,13 @@ class Iterateur_VDF_Elem : public Iterateur_VDF_base
 
 public:
   Iterateur_VDF_Elem() { }
-  Iterateur_VDF_Elem(const Iterateur_VDF_Elem<_TYPE_>& iter) : Iterateur_VDF_base(iter), flux_evaluateur(iter.flux_evaluateur) { elem.ref(iter.elem); }
+  Iterateur_VDF_Elem(const Iterateur_VDF_Elem<_TYPE_>& iter) : Iterateur_VDF_base(iter), flux_evaluateur_(iter.flux_evaluateur_) { elem_.ref(iter.elem_); } // Copy: base part and evaluator are copy-constructed, elem_ is attached to iter.elem_ through ref()
 
-  inline Evaluateur_VDF& evaluateur() override { return static_cast<Evaluateur_VDF&> (flux_evaluateur); }
-  inline const Evaluateur_VDF& evaluateur() const override { return static_cast<const Evaluateur_VDF&> (flux_evaluateur); }
+  inline Evaluateur_VDF& evaluateur() override { return static_cast<Evaluateur_VDF&> (flux_evaluateur_); }
+  inline const Evaluateur_VDF& evaluateur() const override { return static_cast<const Evaluateur_VDF&> (flux_evaluateur_); }
 
   int impr(Sortie& os) const override;
-  void completer_() override { elem.ref(le_dom->face_voisins()); }
+  void completer_() override { elem_.ref(le_dom->face_voisins()); } // attach elem_ to the domain's face_voisins() table (neighbouring elements of each face)
   void ajouter_contribution_autre_pb(const DoubleTab& inco, Matrice_Morse& matrice, const Cond_lim& la_cl, std::map<int, std::pair<int, int>>&) const override;
   void contribuer_au_second_membre(DoubleTab& ) const override;
 
@@ -49,13 +49,13 @@ class Iterateur_VDF_Elem : public Iterateur_VDF_base
   void creer_champ_T_paroi_pour_flux_parietal() override { /* TODO FIXME */ }
 
 protected:
-  _TYPE_ flux_evaluateur;
-  IntTab elem;
+  _TYPE_ flux_evaluateur_;
+  IntTab elem_;
   mutable SFichier Flux, Flux_moment, Flux_sum;
   inline const Milieu_base& milieu() const { return (la_zcl->equation()).milieu(); }
   OBS_PTR(Correlation_base) corr_flux_parietal_;
 
-private:
+  private_but_public_for_cuda
   template <typename Type_Double>
   void ajouter_blocs_bords(const int , matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const;
 
@@ -77,6 +77,7 @@ class Iterateur_VDF_Elem : public Iterateur_VDF_base
 
   void modifier_flux() const;
   template <typename Type_Double> inline void fill_flux_tables_(const int, const int , const double , const Type_Double& , DoubleTab& ) const;
+  KOKKOS_INLINE_FUNCTION void fill_flux_tables_(const int, const int , const double , CIntTabView, DoubleArrView, DoubleTabView, DoubleTabView) const;
 
   void fill_derivee_cc(matrices_t mats, const tabs_t& semi_impl, VectorDeriv& d_cc) const;
 
@@ -102,6 +103,7 @@ class Iterateur_VDF_Elem : public Iterateur_VDF_base
 
   template <typename Type_Double>
   void contribuer_au_second_membre_bords_(const Echange_externe_impose& , const int , const int , const int, const int , const Front_VF& , DoubleTab& ) const;
+
 };
 
 #include <Iterateur_VDF_Elem.tpp> // templates specializations ici ;)
diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.tpp
index 34f32fad1e..fae8e0199f 100644
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.tpp
+++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem.tpp
@@ -21,6 +21,10 @@
 #include <Champ_Uniforme.h>
 #include <communications.h>
 #include <TRUSTSingle.h>
+#include <Eval_Diff_VDF_leaves.h>
+#include <Debog.h>
+#include <typeinfo>
+#include <string_view>
 
 template<class _TYPE_>
 void Iterateur_VDF_Elem<_TYPE_>::ajouter_contribution_autre_pb(const DoubleTab& inco, Matrice_Morse& matrice, const Cond_lim& la_cl, std::map<int, std::pair<int, int>>& f2e) const
@@ -35,9 +39,9 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_contribution_autre_pb(const DoubleTab&
       for (int f = ndeb; f < nfin; f++)
         {
           const int e1 = f2e[f].first, e2 = f2e[f].second;
-          flux_evaluateur.coeffs_face(f, ndeb, cl, aii, ajj);
+          flux_evaluateur_.coeffs_face(f, ndeb, cl, aii, ajj);
           for (int i = 0; i < ncomp; i++)
-            matrice(e1 * ncomp + i, e2 * ncomp + i) = -(elem(f, 0) > -1 ? aii[i] : ajj[i]);
+            matrice(e1 * ncomp + i, e2 * ncomp + i) = -(elem_(f, 0) > -1 ? aii[i] : ajj[i]);
         }
     }
 }
@@ -49,18 +53,20 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_contribution_autre_pb(const DoubleTab&
 template<class _TYPE_>
 void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs(matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
 {
-  ((_TYPE_&) flux_evaluateur).mettre_a_jour();
+  ((_TYPE_&) flux_evaluateur_).mettre_a_jour();
   assert(op_base->equation().inconnue().valeurs().nb_dim() < 3 && la_zcl && le_dom);
   const int ncomp = op_base->equation().inconnue().valeurs().line_size();
   DoubleTab& flux_bords = op_base->flux_bords();
   flux_bords.resize(le_dom->nb_faces_bord(), ncomp);
   flux_bords = 0.;
+
   // modif b.m.: on va faire += sur des items virtuels, initialiser les cases : sinon risque que les cases soient invalides ou non initialisees
+  DoubleArrView data = static_cast<ArrOfDouble&>(secmem).view_wo(); // NOTE(review): write-only view taken on the whole array while only indices from size() to size_array() are written below - confirm view_wo() keeps the entries below size() valid, otherwise use view_rw()
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(secmem.size(), secmem.size_array()), KOKKOS_LAMBDA(const int i)
   {
-    int n = secmem.size_array() - secmem.size();
-    double *data = secmem.addr() + secmem.size();
-    for (; n; n--, data++) *data = 0.;
-  }
+    data(i)=0.; // zero the trailing entries, as the removed host loop did from secmem.size() up to secmem.size_array()
+  });
+  end_gpu_timer(__KERNEL_NAME__);
   if (ncomp == 1)
     {
       ajouter_blocs_bords < SingleDouble > (ncomp, mats, secmem, semi_impl);
@@ -137,41 +143,49 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords(const int ncomp, matrices_t
           Process::exit();
           break;
         }
+      Debog::verifier(la_cl.valeur().que_suis_je(),resu);
     }
 }
 
 template<class _TYPE_> template<typename Type_Double>
-void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_interne(const int N, matrices_t mats, DoubleTab& resu, const tabs_t& semi_impl) const
+void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_interne(const int N, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
-  const DoubleTab& donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
-  Type_Double flux(N), aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? N : 1)), aef(N);
-  const int ndeb = le_dom->premiere_face_int(), nfin = le_dom->nb_faces(), Mv = le_ch_v ? le_ch_v->valeurs().line_size() : N;
-  for (int face = ndeb; face < nfin; face++)
-    {
-      flux_evaluateur.flux_faces_interne(donnee, face, flux);
-      const int e0 = elem(face, 0), e1 = elem(face, 1);
-      // second membre
-      for (int k = 0; k < N; k++)
-        {
-          resu(e0, k) += flux[k];
-          resu(e1, k) -= flux[k];
-        }
-    }
+  const DoubleTab& tab_donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+  const int ndeb = le_dom->premiere_face_int(), nfin = le_dom->nb_faces();
+
+  // A local copy of flux_evaluateur_ is mandatory here: it is a class attribute, so the lambda below could only reach it through 'this'
+  _TYPE_ flux_evaluateur = flux_evaluateur_;
+  flux_evaluateur.view_ro();
+  CIntTabView elem = elem_.view_ro();
+  CDoubleTabView donnee = tab_donnee.view_ro();
+  DoubleTabView resu = tab_resu.view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                       Kokkos::MDRangePolicy<Kokkos::Rank<2>>({ndeb, 0}, {nfin, N}),
+                       KOKKOS_LAMBDA(const int face, const int k)
+  {
+    double flux;
+    flux_evaluateur.flux_faces_interne_comp(donnee, face, k, flux);
+    const int e0 = elem(face, 0), e1 = elem(face, 1);
+    Kokkos::atomic_add(&resu(e0, k), +flux);
+    Kokkos::atomic_add(&resu(e1, k), -flux);
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 
   Matrice_Morse *m_vit = (mats.count("vitesse") && is_convective_op()) ? mats.at("vitesse") : nullptr, *mat = (!is_pb_multiphase() && mats.count(nom_ch_inco_)) ? mats.at(nom_ch_inco_) : nullptr;
   VectorDeriv d_cc;
   fill_derivee_cc(mats, semi_impl, d_cc);
 
+  Type_Double aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? N : 1)), aef(N);
+  const int Mv = le_ch_v ? le_ch_v->valeurs().line_size() : N;
   //derivees : vitesse
   if (m_vit)
     for (int face = ndeb; face < nfin; face++)
       {
-        flux_evaluateur.coeffs_faces_interne_bloc_vitesse(donnee, face, aef);
+        flux_evaluateur.coeffs_faces_interne_bloc_vitesse(tab_donnee, face, aef);
         for (int i = 0; i < 2; i++)
           for (int n = 0, m = 0; n < N; n++, m += (Mv > 1))
-            (*m_vit)(N * elem(face, i) + n, Mv * face + m) += (i ? -1.0 : 1.0) * aef(n);
+            (*m_vit)(N * elem_(face, i) + n, Mv * face + m) += (i ? -1.0 : 1.0) * aef(n);
       }
-
   //derivees : champ convecte
   if (mat || d_cc.size() > 0)
     for (int face = ndeb; face < nfin; face++)
@@ -182,7 +196,7 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_interne(const int N, matrices_t m
 }
 
 template<class _TYPE_> template<bool should_calc_flux, typename Type_Double, typename BC>
-void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int ndeb, const int nfin, const int N, matrices_t mats, DoubleTab& resu, const tabs_t& semi_impl) const
+void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int ndeb, const int nfin, const int N, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
   constexpr bool is_Neum_paroi_adiab = std::is_same<BC, Neumann_paroi_adiabatique>::value;
 
@@ -199,9 +213,9 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int nd
       if (is_Neum_paroi_adiab)
         Process::exit(); // On bloque ici :-)
 
-      const DoubleTab& donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(),
-                       val_b = sub_type(Champ_Face_base, le_champ_convecte_ou_inc.valeur()) ? DoubleTab() :
-                               (use_base_val_b_ ? le_champ_convecte_ou_inc->Champ_base::valeur_aux_bords() : le_champ_convecte_ou_inc->valeur_aux_bords()); // si le champ associe est un champ_face, alors on est dans un operateur de div
+      const DoubleTab& tab_donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs(),
+                       tab_val_b = sub_type(Champ_Face_base, le_champ_convecte_ou_inc.valeur()) ? DoubleTab() :
+                                   (use_base_val_b_ ? le_champ_convecte_ou_inc->Champ_base::valeur_aux_bords() : le_champ_convecte_ou_inc->valeur_aux_bords()); // si le champ associe est un champ_face, alors on est dans un operateur de div
 
       Matrice_Morse *mat = (!is_pb_multiphase() && mats.count(nom_ch_inco_)) ? mats.at(nom_ch_inco_) : nullptr;
       VectorDeriv d_cc;
@@ -211,34 +225,70 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int nd
       if (is_Temp_impose_flux_parietal || is_Neumann_flux_parietal || is_paroi_contact_flux_parietal)
         {
           fill_derivee_cc(mats, semi_impl, d_cc);
-          const DoubleTab& donnee2 = is_pb_multiphase() ? le_champ_convecte_ou_inc->valeurs() : donnee ; // On tente de toujours impliciter le flux parietal en pb multi lol
+          const DoubleTab& donnee2 = is_pb_multiphase() ? le_champ_convecte_ou_inc->valeurs() : tab_donnee ; // On tente de toujours impliciter le flux parietal en pb multi lol
           mat = mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr; // On tente de toujours impliciter le flux parietal en pb multi lol
-          ajouter_blocs_bords_flux_parietal_<Type_Double, BC>(cl, ndeb, nfin, N, donnee2, resu, mat, d_cc, semi_impl);
+          ajouter_blocs_bords_flux_parietal_<Type_Double, BC>(cl, ndeb, nfin, N, donnee2, tab_resu, mat, d_cc, semi_impl);
         }
       else
         {
-          int e, Mv = le_ch_v ? le_ch_v->valeurs().line_size() : N;
-          Type_Double flux(N), aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? N : 1)), aef(N);
-          for (int face = ndeb; face < nfin; face++)
+          // A local copy of flux_evaluateur_ is mandatory here: it is a class attribute, so the lambda below could only reach it through 'this'
+          _TYPE_ flux_evaluateur = flux_evaluateur_;
+          flux_evaluateur.view_ro();
+          CIntTabView elem = elem_.view_ro();
+          CDoubleTabView donnee = tab_donnee.view_ro();
+          CDoubleTabView val_b = tab_val_b.view_ro();
+          BC_View bc_view;
+          // BC with imposed (non zero) values:
+          if constexpr (std::is_same_v<BC, Dirichlet_entree_fluide> ||
+                        std::is_same_v<BC, Scalaire_impose_paroi> ||
+                        std::is_same_v<BC, Dirichlet_loi_paroi>)
+            bc_view.val[0] = cl.tab_val_imp().view_ro();
+          else if constexpr (std::is_same_v<BC, Neumann_paroi>)
+            bc_view.val[0] = cl.tab_flux_impose().view_ro();
+          else if constexpr (std::is_same_v<BC, Echange_global_impose>)
             {
-              flux_evaluateur.flux_face(donnee, val_b, face, cl, ndeb, flux); // Generic code
-              fill_flux_tables_(face, N, 1.0 /* coeff */, flux, resu);
+              bc_view.val[0] = cl.tab_h_imp().view_ro();
+              bc_view.val[1] = cl.tab_T_ext().view_ro();
+              if (cl.has_phi_ext()) bc_view.val[2] = cl.tab_phi_ext().view_ro();
             }
+          else if constexpr (std::is_same_v<BC, Dirichlet_paroi_fixe> ||
+                             std::is_same_v<BC, Symetrie> ||
+                             std::is_same_v<BC, Neumann_sortie_libre>)
+            {}
+          else
+            {
+              cerr << "Error, forbid to assess a BC of type: " << std::string_view(typeid(BC).name()) << endl;
+              Process::exit();
+            }
+          DoubleTabView resu = tab_resu.view_rw();
+          DoubleTabView flux_bords = op_base->flux_bords().view_rw();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                               Kokkos::MDRangePolicy<Kokkos::Rank<2>>({ndeb, 0}, {nfin, N}),
+                               KOKKOS_LAMBDA(const int face, const int k)
+          {
+            double flux_k;
+            flux_evaluateur.template flux_faces_bord_comp<BC>(donnee, val_b, face, bc_view, ndeb, k, flux_k);
+            const int elem1 = elem(face, 0), elem2 = elem(face, 1);
+            if (elem1 > -1) { Kokkos::atomic_add(&resu(elem1, k), +flux_k); flux_bords(face, k) += flux_k; }
+            if (elem2 > -1) { Kokkos::atomic_add(&resu(elem2, k), -flux_k); flux_bords(face, k) -= flux_k; }
+          });
+          end_gpu_timer(__KERNEL_NAME__);
 
           Matrice_Morse *m_vit = (mats.count("vitesse") && is_convective_op()) ? mats.at("vitesse") : nullptr;
-
           fill_derivee_cc(mats, semi_impl, d_cc);
 
+          int e, Mv = le_ch_v ? le_ch_v->valeurs().line_size() : N;
           //derivees : vitesse
           if (m_vit)
             {
+              Type_Double aef(N);
               const IntTab *fcl_v = le_ch_v ? &ref_cast(Champ_Face_base, le_ch_v.valeur()).fcl() : nullptr;
               for (int f = ndeb; f < nfin; f++)
                 if ((*fcl_v)(f, 0) < 2)
                   {
-                    flux_evaluateur.coeffs_face_bloc_vitesse(donnee, val_b, f, cl, ndeb, aef);
+                    flux_evaluateur.coeffs_face_bloc_vitesse(tab_donnee, tab_val_b, f, cl, ndeb, aef);
                     for (int i = 0; i < 2; i++)
-                      if ((e = elem(f, i)) >= 0)
+                      if ((e = elem_(f, i)) >= 0)
                         for (int n = 0, m = 0; n < N; n++, m += (Mv > 1))
                           (*m_vit)(N * e + n, Mv * f + m) += (i ? -1.0 : 1.0) * aef(n);
                   }
@@ -246,46 +296,61 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const BC& cl, const int nd
 
           //derivees : champ convecte
           if (mat || d_cc.size() > 0)
-            for (int face = ndeb; face < nfin; face++)
-              {
-                flux_evaluateur.coeffs_face(face, ndeb, cl, aii, ajj); // Generic code
-                fill_coeffs_matrices(face, aii, ajj, mat, d_cc);
-              }
+            {
+              Type_Double aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? N : 1));
+              for (int face = ndeb; face < nfin; face++)
+                {
+                  flux_evaluateur.coeffs_face(face, ndeb, cl, aii, ajj); // Generic code
+                  fill_coeffs_matrices(face, aii, ajj, mat, d_cc);
+                }
+            }
         }
     }
 }
 
 template<class _TYPE_> template<typename Type_Double>
 void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Periodique& cl, const int ndeb, const int nfin, const int N, const Front_VF& frontiere_dis,
-                                                      matrices_t mats, DoubleTab& resu, const tabs_t& semi_impl) const
+                                                      matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
-  DoubleTab& flux_bords = op_base->flux_bords();
   if (_TYPE_::CALC_FLUX_FACES_PERIO)
     {
-      const DoubleTab& donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
-
-      // Luis : je rajoute l'option multiscalar_diff dans les CL périodiques
-      Type_Double flux(N), aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? N : 1)), aef(N);
-      for (int face = ndeb; face < nfin; face++)
-        {
-          const int e0 = elem(face, 0), e1 = elem(face, 1);
-          flux_evaluateur.flux_face(donnee, donnee, face, cl, ndeb, flux); // attention 2 fois donnee
-
-          for (int n = 0; n < N; n++)
-            {
-              if (e0 > -1)
-                {
-                  resu(e0, n) += 0.5 * flux[n];
-                  if (face < (ndeb + frontiere_dis.nb_faces() / 2)) flux_bords(face, n) += flux[n];
-                }
-              if (e1 > -1)
-                {
-                  resu(e1, n) -= 0.5 * flux[n];
-                  if ((ndeb + frontiere_dis.nb_faces() / 2) <= face) flux_bords(face, n) -= flux[n];
-                }
-            }
-        }
+      const DoubleTab& tab_donnee = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+      int nb_faces = frontiere_dis.nb_faces();
+      // A local copy of flux_evaluateur_ is mandatory here: it is a class attribute, so the lambda below could only reach it through 'this'
+      _TYPE_ flux_evaluateur = flux_evaluateur_;
+      flux_evaluateur.view_ro();
+      // DoubleTab tab_val_imp(1,1);
+      // PL: Even this tiny array should be a DoubleTrav, because repeated allocation/deallocation on the device is very slow
+      // Seen during profiling: 6% of runtime on dalia for Canal_VDF !!!
+      DoubleTrav tab_val_imp(1,1); // Trick to pass to evaluateur flux_face the distance periodicity value
+      tab_val_imp(0,0) = cl.distance();
+      BC_View bc_view;
+      bc_view.val[0] = tab_val_imp.view_ro();
+      CIntTabView elem = elem_.view_ro();
+      CDoubleTabView donnee = tab_donnee.view_ro();
+      DoubleTabView resu = tab_resu.view_rw();
+      DoubleTabView flux_bords = op_base->flux_bords().view_rw();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>({ndeb, 0}, {nfin, N}),
+                           KOKKOS_LAMBDA(const int face, const int k)
+      {
+        double flux_k;
+        flux_evaluateur.template flux_faces_bord_comp<Periodique>(donnee, donnee, face, bc_view, ndeb, k, flux_k);
+        const int e0 = elem(face, 0), e1 = elem(face, 1);
+        if (e0 > -1)
+          {
+            Kokkos::atomic_add(&resu(e0, k), +0.5 * flux_k);
+            if (face < (ndeb + nb_faces / 2)) Kokkos::atomic_add(&flux_bords(face, k), +flux_k);
+          }
+        if (e1 > -1)
+          {
+            Kokkos::atomic_add(&resu(e1, k), -0.5 * flux_k);
+            if ((ndeb + nb_faces / 2) <= face) Kokkos::atomic_add(&flux_bords(face, k), -flux_k);
+          }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
 
+      Type_Double aii(N * (multiscalar_diff_ ? N : 1)), ajj(N * (multiscalar_diff_ ? N : 1)), aef(N);
       Matrice_Morse *m_vit = mats.count("vitesse") ? mats.at("vitesse") : nullptr, *mat = (!is_pb_multiphase() && mats.count(nom_ch_inco_)) ? mats.at(nom_ch_inco_) : nullptr;
       VectorDeriv d_cc;
       fill_derivee_cc(mats, semi_impl, d_cc);
@@ -294,14 +359,14 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Periodique& cl, cons
       if (m_vit)
         for (int face = ndeb; face < nfin; face++)
           {
-            const int e0 = elem(face, 0), e1 = elem(face, 1);
-            flux_evaluateur.coeffs_face_bloc_vitesse(donnee, DoubleTab(), face, cl, ndeb, aef);
+            const int e0 = elem_(face, 0), e1 = elem_(face, 1);
+            flux_evaluateur.coeffs_face_bloc_vitesse(tab_donnee, DoubleTab(), face, cl, ndeb, aef);
             if (e0 > -1)
               for (int i = 0; i < N; i++)
-                if (face < (ndeb + frontiere_dis.nb_faces() / 2)) (*m_vit)(e0 * N + i, face * N + i) += aef[i];
+                if (face < (ndeb + nb_faces / 2)) (*m_vit)(e0 * N + i, face * N + i) += aef[i];
             if (e1 > -1)
               for (int i = 0; i < N; i++)
-                if ((ndeb + frontiere_dis.nb_faces() / 2) <= face) (*m_vit)(e1 * N + i, face * N + i) -= aef[i];
+                if ((ndeb + nb_faces / 2) <= face) (*m_vit)(e1 * N + i, face * N + i) -= aef[i];
           }
 
       //derivees : champ convecte
@@ -333,10 +398,11 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Echange_externe_impo
         boundary_index = num_cl;
 
       int e, Mv = le_ch_v ? le_ch_v->valeurs().line_size() : N;
+      ToDo_Kokkos("BC Echange_externe_impose");
       for (int face = ndeb; face < nfin; face++)
         {
           const int local_face = le_dom->front_VF(boundary_index).num_local_face(face);
-          flux_evaluateur.flux_face(donnee, boundary_index, face, local_face, cl, ndeb, flux);
+          flux_evaluateur_.flux_face(donnee, boundary_index, face, local_face, cl, ndeb, flux);
           fill_flux_tables_(face, N, 1.0 /* coeff */, flux, resu);
         }
 
@@ -351,10 +417,10 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Echange_externe_impo
           for (int face = ndeb; face < nfin; face++)
             {
               const int local_face = le_dom->front_VF(boundary_index).num_local_face(face);
-              flux_evaluateur.coeffs_face_bloc_vitesse(donnee, val_b, boundary_index, face, local_face, cl, ndeb, aef);
+              flux_evaluateur_.coeffs_face_bloc_vitesse(donnee, val_b, boundary_index, face, local_face, cl, ndeb, aef);
 
               for (int i = 0; i < 2; i++)
-                if ((e = elem(face, i)) >= 0)
+                if ((e = elem_(face, i)) >= 0)
                   for (int n = 0, m = 0; n < N; n++, m += (Mv > 1)) (*m_vit)(N * e + n, Mv * face + m) += (i ? -1.0 : 1.0) * aef(n);
             }
         }
@@ -364,7 +430,7 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_(const Echange_externe_impo
         for (int face = ndeb; face < nfin; face++)
           {
             const int local_face = le_dom->front_VF(boundary_index).num_local_face(face);
-            flux_evaluateur.coeffs_face(donnee, boundary_index, face, local_face, ndeb, cl, aii, ajj);
+            flux_evaluateur_.coeffs_face(donnee, boundary_index, face, local_face, ndeb, cl, aii, ajj);
             fill_coeffs_matrices(face, aii, ajj, mat, d_cc); // XXX : Attention Yannick pour d_cc c'est pas tout a fait comme avant ... N et M ...
           }
     }
@@ -390,21 +456,21 @@ inline void Iterateur_VDF_Elem<_TYPE_>::fill_coeffs_matrices(const int f, const
   if (mat)
     {
       for (int i = 0; i < 2; i++)
-        for (int j = 0, e = elem(f, i); j < 2; j++)
-          for (int n = 0, eb = elem(f, j); n < N; n++)
+        for (int j = 0, e = elem_(f, i); j < 2; j++)
+          for (int n = 0, eb = elem_(f, j); n < N; n++)
             for (int m = (multiscalar_diff_ ? 0 : n); m < (multiscalar_diff_ ? N : n + 1); m++)
               (*mat)(N * e + n, N * eb + m) += (i == j ? 1.0 : -1.0) * coeff * (j ? ajj[multiscalar_diff_ ? N * n + m : n] : aii[multiscalar_diff_ ? N * n + m : n]);
     }
   else
     for (auto &&d_m_i : d_cc)
       for (int i = 0; i < 2; i++)
-        for (int j = 0, e = elem(f, i); j < 2; j++)
+        for (int j = 0, e = elem_(f, i); j < 2; j++)
           {
             const int M = std::get<2> (d_m_i);
             const DoubleTab& d_var_cc = *std::get<0> (d_m_i);
             Matrice_Morse& d_var_operateur = *std::get<1> (d_m_i);
 
-            for (int n = 0, m = 0, eb = elem(f, j); n < N; n++, m += (M > 1))
+            for (int n = 0, m = 0, eb = elem_(f, j); n < N; n++, m += (M > 1))
               d_var_operateur(N * e + n, M * eb + m) += (i == j ? 1.0 : -1.0) * coeff * (j ? ajj[n] : aii[n]) * d_var_cc(eb, m);
           }
 }
@@ -412,7 +478,7 @@ inline void Iterateur_VDF_Elem<_TYPE_>::fill_coeffs_matrices(const int f, const
 template<class _TYPE_> template<typename Type_Double>
 inline void Iterateur_VDF_Elem<_TYPE_>::fill_coeffs_matrices(const int face, Type_Double& aii, Type_Double& ajj, Matrice_Morse *mat, VectorDeriv& d_cc) const
 {
-  const int e0 = elem(face, 0), e1 = elem(face, 1);
+  const int e0 = elem_(face, 0), e1 = elem_(face, 1);
   const int N = multiscalar_diff_ ? int(sqrt(aii.size_array())) : aii.size_array();
 
   if (mat)
diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_FT_TCL.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_FT_TCL.tpp
index 16746197ca..6286e20c0a 100644
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_FT_TCL.tpp
+++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_FT_TCL.tpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2023, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -21,11 +21,29 @@
  * Elie Saikali : NOTA BENE : fichier surcharger dans trio pour le FT, TCL model
  */
 
+template<class _TYPE_>
+KOKKOS_INLINE_FUNCTION void Iterateur_VDF_Elem<_TYPE_>::fill_flux_tables_(const int face, const int ncomp, const double coeff, CIntTabView elem, DoubleArrView flux, DoubleTabView resu, DoubleTabView flux_bords) const
+{
+  // View-based variant: add +coeff*flux to the first neighbour of the face and -coeff*flux to the second one, mirrored in flux_bords
+  const int neighbours[2] = { elem(face, 0), elem(face, 1) };
+  for (int side = 0; side < 2; side++)
+    {
+      if (neighbours[side] < 0) continue; // no element on this side of the face
+      const double signed_coeff = side ? -coeff : coeff;
+      for (int k = 0; k < ncomp; k++)
+        {
+          const double contribution = signed_coeff * flux[k];
+          Kokkos::atomic_add(&resu(neighbours[side], k), contribution); // resu is updated atomically, as in the original
+          flux_bords(face, k) += contribution;
+        }
+    }
+}
+
 template<class _TYPE_> template<typename Type_Double>
 inline void Iterateur_VDF_Elem<_TYPE_>::fill_flux_tables_(const int face, const int ncomp, const double coeff, const Type_Double& flux, DoubleTab& resu) const
 {
   DoubleTab& flux_bords = op_base->flux_bords();
-  const int elem1 = elem(face, 0), elem2 = elem(face, 1);
+  const int elem1 = elem_(face, 0), elem2 = elem_(face, 1);
   if (elem1 > -1)
     for (int k = 0; k < ncomp; k++)
       {
diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_Multiphase_Parietal.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_Multiphase_Parietal.tpp
index 67820dc4ed..5f6d67c5b0 100644
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_Multiphase_Parietal.tpp
+++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_Multiphase_Parietal.tpp
@@ -122,9 +122,9 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_flux_parietal_(const BC& cl
       nv = 0.;
       d_nuc = 0.;
 
-      const int e = elem(face, 0) > -1 ? elem(face, 0) : elem(face, 1);
+      const int e = elem_(face, 0) > -1 ? elem_(face, 0) : elem_(face, 1);
 
-      const double y = elem(face, 0) > -1 ? le_dom->dist_face_elem0(face, e) : le_dom->dist_face_elem1(face, e);
+      const double y = elem_(face, 0) > -1 ? le_dom->dist_face_elem0(face, e) : le_dom->dist_face_elem1(face, e);
 
       // fill in struct
       in.N = N;
@@ -181,7 +181,7 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_flux_parietal_(const BC& cl
                   (*pdTf_qpi)(e, k, l, m) += dTf_qpi(k, l, m) * fs(face);
 
           for (int k = 0; k < N; k++)
-            flux[k] = (elem(face, 0) != -1) ? qpk(k) * fs(face) * pf(face) : -qpk(k) * fs(face) * pf(face);
+            flux[k] = (elem_(face, 0) != -1) ? qpk(k) * fs(face) * pf(face) : -qpk(k) * fs(face) * pf(face);
 
           fill_flux_tables_(face, N, 1.0, flux, resu);
 
@@ -269,7 +269,7 @@ void Iterateur_VDF_Elem<_TYPE_>::ajouter_blocs_bords_flux_parietal_(const BC& cl
                   (*pdTf_qpi)(e, k, l, m) += dTf_qpi(k, l, m) * fs(face);
 
           for (int k = 0; k < N; k++)
-            flux[k] = (elem(face, 0) != -1) ? qpk(k) * fs(face) : -qpk(k) * fs(face);
+            flux[k] = (elem_(face, 0) != -1) ? qpk(k) * fs(face) : -qpk(k) * fs(face);
 
           fill_flux_tables_(face, N, 1.0, flux, resu);
 
diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_bis.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_bis.tpp
index 13587fcf40..5335a8c284 100644
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_bis.tpp
+++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Elem_bis.tpp
@@ -17,17 +17,15 @@
 #define Iterateur_VDF_Elem_bis_TPP_included
 
 template <class _TYPE_>
-void  Iterateur_VDF_Elem<_TYPE_>::modifier_flux() const
+void Iterateur_VDF_Elem<_TYPE_>::modifier_flux() const
 {
   if (op_base->equation().inconnue().le_nom().debute_par("temperature")
       && !( sub_type(Operateur_Diff_base,op_base.valeur()) && ref_cast(Operateur_Diff_base,op_base.valeur()).diffusivite().le_nom() == "conductivite" ) )
     {
-      DoubleTab& flux_bords=op_base->flux_bords();
       const Domaine_VDF& le_dom_vdf=ref_cast(Domaine_VDF,op_base->equation().domaine_dis());
-      const Champ_base& rho = (op_base->equation()).milieu().masse_volumique();
+      const Champ_base& masse_volumique = (op_base->equation()).milieu().masse_volumique();
       const Champ_Don_base& Cp = (op_base->equation()).milieu().capacite_calorifique();
-      const IntTab& face_voisins=le_dom_vdf.face_voisins();
-      int rho_uniforme = sub_type(Champ_Uniforme,rho) ? 1 : 0, cp_uniforme = sub_type(Champ_Uniforme,Cp) ? 1 : 0;
+      int rho_uniforme = sub_type(Champ_Uniforme,masse_volumique) ? 1 : 0, cp_uniforme = sub_type(Champ_Uniforme,Cp) ? 1 : 0;
       int is_rho_u=op_base->equation().probleme().is_dilatable();
       if (is_rho_u)
         {
@@ -37,13 +35,21 @@ void  Iterateur_VDF_Elem<_TYPE_>::modifier_flux() const
             if (ref_cast(Op_Conv_VDF_base,op).vitesse().le_nom()=="rho_u") is_rho_u = 1;
         }
       const int nb_faces_bords = le_dom_vdf.nb_faces_bord();
-      for (int face = 0; face < nb_faces_bords; face++)
-        for(int k = 0; k < flux_bords.dimension(1); k++)
+
+      CIntTabView face_voisins = le_dom_vdf.face_voisins().view_ro();
+      CDoubleTabView rho = masse_volumique.valeurs().view_ro();
+      CDoubleTabView cp = Cp.valeurs().view_ro();
+      DoubleTabView flux_bords = op_base->flux_bords().view_rw();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces_bords), KOKKOS_LAMBDA(const int face)
+      {
+        for(int k = 0; k < (int)flux_bords.extent(1); k++)
           {
             int e = (face_voisins(face, 0) != -1) ? face_voisins(face, 0) : face_voisins(face, 1);
-            const double rho_ = (is_rho_u) ? 1.0 : rho.valeurs()(!rho_uniforme * e, k);
-            flux_bords(face, k) *= rho_ * Cp.valeurs()(!cp_uniforme * e, k);
+            const double rho_ = (is_rho_u) ? 1.0 : rho(!rho_uniforme * e, k);
+            flux_bords(face, k) *= rho_ * cp(!cp_uniforme * e, k);
           }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
 }
 
@@ -55,34 +61,40 @@ int Iterateur_VDF_Elem<_TYPE_>::impr(Sortie& os) const
   const int impr_bord=(madomaine.bords_a_imprimer().est_vide() ? 0:1);
   const Schema_Temps_base& sch = la_zcl->equation().probleme().schema_temps();
   double temps = sch.temps_courant();
-  DoubleTab& flux_bords=op_base->flux_bords();
-  DoubleVect bilan(flux_bords.dimension(1));
-  int k,face;
+  DoubleTab& tab_flux_bords=op_base->flux_bords();
+  DoubleVect bilan(tab_flux_bords.dimension(1));
   int nb_front_Cl=le_dom->nb_front_Cl();
-  DoubleTrav flux_bords2( 3, nb_front_Cl , flux_bords.dimension(1));
-  flux_bords2=0;
+  DoubleTrav tab_flux_bords2( 3, nb_front_Cl , tab_flux_bords.dimension(1));
+  tab_flux_bords2=0;
   /*flux_bord(k)          ->   flux_bords2(0,num_cl,k) */
   /*flux_bord_perio1(k)   ->   flux_bords2(1,num_cl,k) */
   /*flux_bord_perio2(k)   ->   flux_bords2(2,num_cl,k) */
+  const int ncomp = tab_flux_bords.dimension(1);
+  CDoubleTabView flux_bords = tab_flux_bords.view_ro();
+  DoubleTabView3 flux_bords2 = tab_flux_bords2.view_rw<3>();
   for (int num_cl=0; num_cl<nb_front_Cl; num_cl++)
     {
       const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl);
       const Front_VF& frontiere_dis = ref_cast(Front_VF,la_cl->frontiere_dis());
-      int ndeb = frontiere_dis.num_premiere_face();
-      int nfin = ndeb + frontiere_dis.nb_faces();
-      int periodicite = (type_cl(la_cl)==periodique?1:0);
-      for (face=ndeb; face<nfin; face++)
-        for(k=0; k<flux_bords.dimension(1); k++)
+      const int ndeb = frontiere_dis.num_premiere_face();
+      const int nfin = ndeb + frontiere_dis.nb_faces();
+      const int periodicite = (type_cl(la_cl)==periodique?1:0);
+      const int first_half_end = ndeb + frontiere_dis.nb_faces() / 2;
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int face)
+      {
+        for(int k=0; k<ncomp; k++)
           {
-            flux_bords2(0,num_cl,k)+=flux_bords(face, k);
+            Kokkos::atomic_add(&flux_bords2(0,num_cl,k), flux_bords(face, k));
             if(periodicite)
               {
-                if( face < (ndeb+frontiere_dis.nb_faces()/2) ) flux_bords2(1,num_cl,k)+=flux_bords(face, k);
-                else flux_bords2(2,num_cl,k)+=flux_bords(face, k);
+                if( face < first_half_end ) Kokkos::atomic_add(&flux_bords2(1,num_cl,k), flux_bords(face, k));
+                else Kokkos::atomic_add(&flux_bords2(2,num_cl,k), flux_bords(face, k));
               }
           }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     } /* fin for num_cl */
-  mp_sum_for_each_item(flux_bords2);
+  mp_sum_for_each_item(tab_flux_bords2);
   if (je_suis_maitre())
     {
       op_base->ouvrir_fichier(Flux,"",1);
@@ -91,18 +103,18 @@ int Iterateur_VDF_Elem<_TYPE_>::impr(Sortie& os) const
         {
           const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl);
           int periodicite = (type_cl(la_cl)==periodique?1:0);
-          for(k=0; k<flux_bords.dimension(1); k++)
+          for(int k=0; k<tab_flux_bords.dimension(1); k++)
             {
-              bilan(k)+=flux_bords2(0,num_cl,k);
+              bilan(k)+=tab_flux_bords2(0,num_cl,k);
               if(periodicite)
                 {
-                  Flux.add_col(flux_bords2(1,num_cl,k));
-                  Flux.add_col(flux_bords2(2,num_cl,k));
+                  Flux.add_col(tab_flux_bords2(1,num_cl,k));
+                  Flux.add_col(tab_flux_bords2(2,num_cl,k));
                 }
-              else Flux.add_col(flux_bords2(0,num_cl,k));
+              else Flux.add_col(tab_flux_bords2(0,num_cl,k));
             }
         }
-      for(k=0; k<flux_bords.dimension(1); k++)
+      for(int k=0; k<tab_flux_bords.dimension(1); k++)
         Flux.add_col(bilan(k));
       Flux << finl;
     }
@@ -120,20 +132,20 @@ int Iterateur_VDF_Elem<_TYPE_>::impr(Sortie& os) const
           if (madomaine.bords_a_imprimer().contient(la_fr.le_nom()))
             {
               Flux_face << "# Flux par face sur " << la_fr.le_nom() << " au temps " << temps << " : " << finl;
-              for (face=ndeb; face<nfin; face++)
+              for (int face=ndeb; face<nfin; face++)
                 {
                   if (dimension == 2)
                     Flux_face << "# Face a x= " << le_dom->xv(face,0) << " y= " << le_dom->xv(face,1);
                   else if (dimension == 3)
                     Flux_face << "# Face a x= " << le_dom->xv(face,0) << " y= " << le_dom->xv(face,1) << " z= " << le_dom->xv(face,2);
-                  for(k=0; k<flux_bords.dimension(1); k++)
+                  for(int k=0; k<tab_flux_bords.dimension(1); k++)
                     {
                       if (!est_egal(le_dom_vdf.face_surfaces(face),0., 1.e-20))
                         {
                           Flux_face << " surface_face(m2)= " << le_dom_vdf.face_surfaces(face);
-                          Flux_face << " flux_par_surface(W/m2)= " << flux_bords(face, k)/le_dom_vdf.face_surfaces(face);
+                          Flux_face << " flux_par_surface(W/m2)= " << tab_flux_bords(face, k)/le_dom_vdf.face_surfaces(face);
                         }
-                      Flux_face << " flux(W)= " << flux_bords(face, k);
+                      Flux_face << " flux(W)= " << tab_flux_bords(face, k);
                     }
                   Flux_face << finl;
                 }
@@ -149,7 +161,7 @@ int Iterateur_VDF_Elem<_TYPE_>::impr(Sortie& os) const
 template <class _TYPE_>
 void Iterateur_VDF_Elem<_TYPE_>::contribuer_au_second_membre(DoubleTab& resu) const
 {
-  ((_TYPE_&) flux_evaluateur).mettre_a_jour();
+  ((_TYPE_&) flux_evaluateur_).mettre_a_jour();
   const int ncomp = resu.line_size();
   assert(resu.nb_dim() < 3 && la_zcl && le_dom);
   assert(op_base->flux_bords().dimension(0)==le_dom->nb_faces_bord()); /* resize deja fait */
@@ -221,8 +233,8 @@ void Iterateur_VDF_Elem<_TYPE_>::contribuer_au_second_membre_interne(const int n
   const int ndeb = domaine_VDF.premiere_face_int(), nfin = domaine_VDF.nb_faces();
   for (int face = ndeb; face < nfin; face++)
     {
-      const int elem0 = elem(face,0), elem1 = elem(face,1);
-      flux_evaluateur.secmem_faces_interne(face, flux);
+      const int elem0 = elem_(face,0), elem1 = elem_(face,1);
+      flux_evaluateur_.secmem_faces_interne(face, flux);
       for (int k = 0; k < ncomp; k++)
         {
           resu(elem0,k) += flux[k];
@@ -240,7 +252,7 @@ void Iterateur_VDF_Elem<_TYPE_>::contribuer_au_second_membre_bords_(const BC& cl
       Type_Double flux(ncomp);
       for (int face = ndeb; face < nfin; face++)
         {
-          flux_evaluateur.secmem_face(face, cl, ndeb, flux); // Generic code
+          flux_evaluateur_.secmem_face(face, cl, ndeb, flux); // Generic code
           is_Periodique ? fill_flux_tables_(face,ncomp,0.5 /* coeff */,flux,resu) : fill_flux_tables_(face,ncomp,1.0 /* coeff */,flux,resu);
         }
     }
@@ -257,7 +269,7 @@ void Iterateur_VDF_Elem<_TYPE_>::contribuer_au_second_membre_bords_(const Echang
       for (int face = ndeb; face < nfin; face++)
         {
           int local_face=le_dom->front_VF(boundary_index).num_local_face(face);
-          flux_evaluateur.secmem_face(boundary_index,face,local_face, cl, ndeb, flux);
+          flux_evaluateur_.secmem_face(boundary_index,face,local_face, cl, ndeb, flux);
           fill_flux_tables_(face,ncomp,1.0 /* coeff */,flux,resu);
         }
     }
diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face.h b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face.h
index c93dc12013..a9fc9755b6 100644
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face.h
+++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -36,8 +36,8 @@ class Iterateur_VDF_Face : public Iterateur_VDF_base
   Iterateur_VDF_Face() { }
   Iterateur_VDF_Face(const Iterateur_VDF_Face<_TYPE_>& );
 
-  inline Evaluateur_VDF& evaluateur() override { return static_cast<Evaluateur_VDF&> (flux_evaluateur); }
-  inline const Evaluateur_VDF& evaluateur() const override { return static_cast<const Evaluateur_VDF&> (flux_evaluateur); }
+  inline Evaluateur_VDF& evaluateur() override { return static_cast<Evaluateur_VDF&> (flux_evaluateur_); }
+  inline const Evaluateur_VDF& evaluateur() const override { return static_cast<const Evaluateur_VDF&> (flux_evaluateur_); }
 
   int impr(Sortie& os) const override;
   void completer_() override;
@@ -46,19 +46,24 @@ class Iterateur_VDF_Face : public Iterateur_VDF_base
   void ajouter_blocs(matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const override;
 
 protected:
-  _TYPE_ flux_evaluateur;
+  _TYPE_ flux_evaluateur_;
   int nb_elem = -100, premiere_arete_interne = -100, derniere_arete_interne = -100, premiere_arete_mixte = -100, derniere_arete_mixte = -100;
   int premiere_arete_bord = -100, derniere_arete_bord = -100, premiere_arete_coin = -100, derniere_arete_coin = -100;
   mutable SFichier Flux, Flux_moment, Flux_sum;
-  IntTab Qdm, elem, elem_faces;
-  IntVect orientation, type_arete_bord, type_arete_coin;
+  IntTab Qdm_, elem_, elem_faces_;
+  IntVect orientation_, type_arete_bord, type_arete_coin;
+  mutable IntVects aretes_bord_par_type_; // List of arete_bord ranked by type
+  mutable IntVects aretes_coin_par_type_; // List of arete_coin ranked by type
+  mutable DoubleTab val_imp_face_bord_; // Work array storing the values imposed on boundary faces. Useful for the GPU
+  mutable DoubleTab coeff_frottement_face_bord_; // Work array storing the friction coefficients on boundary faces. Useful for the GPU
 
 private:
-  void multiply_by_rho_if_hydraulique(DoubleTab&) const;
-  template<typename Type_Double> void fill_resu_tab(const int, const int, const int, const Type_Double&, DoubleTab&) const;
   template<typename Type_Double> void fill_coeff_matrice_morse(const int, const int, const int, const int, const Type_Double&, Matrice_Morse&) const;
   template<typename Type_Double> void fill_coeff_matrice_morse(const int, const int, const int, const int, const Type_Double&, const Type_Double&, Matrice_Morse&) const;
 
+  private_but_public_for_cuda
+  void multiply_by_rho_if_hydraulique(DoubleTab&) const;
+
   /* ************************************** *
    * *********  INTERFACE BLOCS  ********** *
    * ********* SFINAE  TEMPLATES ********** *
@@ -70,15 +75,15 @@ class Iterateur_VDF_Face : public Iterateur_VDF_base
 
   template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
   std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-  ajouter_blocs_aretes_bords_(const int , const int , const matrices_t& , DoubleTab& , const tabs_t& ) const;
+  ajouter_blocs_aretes_bords_(const IntVect& , const int , const matrices_t& , DoubleTab& , const tabs_t& ) const;
 
   template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
   std::enable_if_t< Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE, void>
-  ajouter_blocs_aretes_bords_(const int , const int , const matrices_t& , DoubleTab& , const tabs_t& ) const;
+  ajouter_blocs_aretes_bords_(const IntVect& , const int , const matrices_t& , DoubleTab& , const tabs_t& ) const;
 
   template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
   std::enable_if_t< Arete_Type == Type_Flux_Arete::PERIODICITE, void>
-  ajouter_blocs_aretes_bords_(const int , const int , const matrices_t& , DoubleTab& , const tabs_t& ) const;
+  ajouter_blocs_aretes_bords_(const IntVect& , const int , const matrices_t& , DoubleTab& , const tabs_t& ) const;
 
   /* ====== COINS ===== */
   template<typename Type_Double>
@@ -86,15 +91,15 @@ class Iterateur_VDF_Face : public Iterateur_VDF_base
 
   template <bool should_calc_flux, Type_Flux_Arete Arete_Type, TypeAreteCoinVDF::type_arete Arete_Type_Coin, typename Type_Double>
   std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI, void>
-  ajouter_blocs_aretes_coins_(const int , const int , matrices_t , DoubleTab& , const tabs_t& ) const;
+  ajouter_blocs_aretes_coins_(const IntVect& , const int , matrices_t , DoubleTab& , const tabs_t& ) const;
 
   template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
   std::enable_if_t<Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
-  ajouter_blocs_aretes_coins_(const int , const int , matrices_t , DoubleTab& , const tabs_t& ) const;
+  ajouter_blocs_aretes_coins_(const IntVect& , const int , matrices_t , DoubleTab& , const tabs_t& ) const;
 
   template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
   std::enable_if_t<Arete_Type == Type_Flux_Arete::PERIODICITE, void>
-  ajouter_blocs_aretes_coins_(const int , const int , matrices_t , DoubleTab& , const tabs_t& ) const;
+  ajouter_blocs_aretes_coins_(const IntVect& , const int , matrices_t , DoubleTab& , const tabs_t& ) const;
 
   /* ====== INTERNES  & MIXTES ===== */
   template<typename Type_Double>
diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face.tpp
index dfaf4c994a..6aa983f1ef 100644
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face.tpp
+++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face.tpp
@@ -25,13 +25,25 @@
 template<class _TYPE_>
 void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs(matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
 {
-  ((_TYPE_&) flux_evaluateur).mettre_a_jour();
+  ((_TYPE_&) flux_evaluateur_).mettre_a_jour();
   assert(op_base->equation().inconnue().valeurs().nb_dim() < 3);
   const int ncomp = op_base->equation().inconnue().valeurs().line_size();
   DoubleTab& tab_flux_bords = op_base->flux_bords();
   tab_flux_bords.resize(le_dom->nb_faces_bord(), dimension);
   tab_flux_bords = 0.;
 
+  // Must be sized larger than strictly necessary so that virtual faces are handled simply
+  // Important: use the time and the zcl of the evaluator for F5!
+  const Domaine_Cl_VDF& zcl = evaluateur().get_la_zcl();
+
+  // Compute coeff_frottement_face_bord_
+  Champ_Face_coeff_frottement_face_bord(coeff_frottement_face_bord_, zcl);
+
+  // Compute val_imp_face_bord_
+  double temps = flux_evaluateur_.inconnue()->temps();
+  const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+  Champ_Face_get_val_imp_face_bord(temps, val_imp_face_bord_, zcl, &tab_inco);
+
   if (ncomp == 1)
     {
       ajouter_blocs_aretes_bords<SingleDouble>(ncomp, mats, secmem, semi_impl);
@@ -72,119 +84,168 @@ void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords(const int ncomp, con
 {
   if (!_TYPE_::CALC_ARR_BORD) return; /* do nothing */
 
-  for (int n_arete = premiere_arete_bord; n_arete < derniere_arete_bord; n_arete++)
+  if (aretes_bord_par_type_.size()==0)
     {
-      const int n_type = type_arete_bord(n_arete - premiere_arete_bord);
-      switch(n_type)
+      // Fill aretes_bord_par_type_
+      const int n_types = 7; // See enum type_arete in Domaine_Cl_VDF.h
+      aretes_bord_par_type_.resize(n_types);
+      ArrOfInt size(n_types);
+      for (int num_arete = 0; num_arete < type_arete_bord.size(); num_arete++)
         {
-        case TypeAreteBordVDF::PAROI_PAROI:
-          ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteBordVDF::PAROI_FLUIDE:
-          ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_PAR_FL, Type_Flux_Arete::PAROI_FLUIDE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteBordVDF::FLUIDE_FLUIDE:
-          ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_FL, Type_Flux_Arete::FLUIDE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteBordVDF::PAROI_NAVIER:
-          ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_NAVIER_PAR, Type_Flux_Arete::NAVIER_PAROI, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteBordVDF::FLUIDE_NAVIER:
-          ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_NAVIER_FL, Type_Flux_Arete::NAVIER_FLUIDE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteBordVDF::NAVIER_NAVIER:
-          ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_NAVIER, Type_Flux_Arete::NAVIER, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteBordVDF::PERIO_PERIO:
-          ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_PERIO, Type_Flux_Arete::PERIODICITE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        default:
-          Cerr << "On a rencontre un type d'arete non prevu : [ num arete : " << n_arete << " ], [ type : " << n_type << " ]" << finl;
-          Process::exit();
+          const int n_type = type_arete_bord[num_arete];
+          if (n_type >= n_types) // Unknown type: abort (as the former switch did) instead of writing past the per-type tables
+            {
+              Cerr << "Unexpected arete type : [ num arete : " << premiere_arete_bord + num_arete << " ], [ type : " << n_type << " ]" << finl;
+              Process::exit();
+            }
+          if (n_type >= 0) size(n_type)++; // First pass: count the aretes of each type
+        }
+      for (int n_type = 0; n_type < n_types; n_type++)
+        {
+          aretes_bord_par_type_[n_type].resize(size(n_type));
+          size(n_type) = 0; // Reused below as the per-type insertion cursor
+        }
+      for (int num_arete = 0; num_arete < type_arete_bord.size(); num_arete++)
+        {
+          const int n_type = type_arete_bord[num_arete];
+          if (n_type >= 0) aretes_bord_par_type_[n_type][size(n_type)++] = premiere_arete_bord + num_arete; // Second pass: store the global arete index
         }
     }
+
+  ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::PAROI_PAROI],ncomp,mats,secmem,semi_impl);
+  ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_PAR_FL, Type_Flux_Arete::PAROI_FLUIDE, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::PAROI_FLUIDE], ncomp, mats, secmem, semi_impl);
+  ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_FL, Type_Flux_Arete::FLUIDE, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::FLUIDE_FLUIDE],ncomp,mats,secmem,semi_impl);
+  ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_NAVIER_PAR, Type_Flux_Arete::NAVIER_PAROI, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::PAROI_NAVIER],ncomp,mats,secmem,semi_impl);
+  ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_NAVIER_FL, Type_Flux_Arete::NAVIER_FLUIDE, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::FLUIDE_NAVIER], ncomp, mats, secmem, semi_impl);
+  ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_NAVIER, Type_Flux_Arete::NAVIER, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::NAVIER_NAVIER],ncomp,mats,secmem,semi_impl);
+  ajouter_blocs_aretes_bords_<_TYPE_::CALC_ARR_PERIO, Type_Flux_Arete::PERIODICITE, Type_Double>(aretes_bord_par_type_[TypeAreteBordVDF::PERIO_PERIO], ncomp, mats, secmem, semi_impl);
 }
 
 template <class _TYPE_> template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
 std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI || Arete_Type == Type_Flux_Arete::NAVIER || Arete_Type == Type_Flux_Arete::NAVIER_PAROI, void>
-Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const int ncomp, const matrices_t& mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const IntVect& tab_aretes_bord, const int ncomp, const matrices_t& mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
   if (should_calc_flux)
     {
       constexpr bool is_PAROI = (Arete_Type == Type_Flux_Arete::PAROI);
-      Type_Double flux(ncomp), aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp);
-      const int n = le_dom->nb_faces_bord(), fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3);
-      DoubleTab& tab_flux_bords = op_base->flux_bords();
-      const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+      const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
 
-      const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
-                             &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
+      const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
+                                 &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
 
       // second membre
-      flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, signe, flux);
-      for (int k = 0; k < ncomp; k++)
-        {
-          secmem(fac3, k) += signe * flux[k];
-          if (is_PAROI)
-            {
-              if (fac1 < n) tab_flux_bords(fac1, orientation(fac3)) -= 0.5 * signe * flux[k];
-              if (fac2 < n) tab_flux_bords(fac2, orientation(fac3)) -= 0.5 * signe * flux[k];
-            }
-        }
+      const int n = le_dom->nb_faces_bord();
+      // A local copy of flux_evaluateur_ is mandatory here because it is an attribute of the class
+      _TYPE_ flux_evaluateur = flux_evaluateur_;
+      flux_evaluateur.view_ro();
+      CIntTabView Qdm = Qdm_.view_ro();
+      CIntArrView aretes_bord = tab_aretes_bord.view_ro();
+      CIntArrView orientation = orientation_.view_ro();
+      CDoubleTabView a_r = tab_a_r ? tab_a_r->view_ro() : CDoubleTabView(); // Bind alpha_rho when it exists (same data the matrix path gets via tab_a_r); empty view otherwise
+      CDoubleTabView inco = tab_inco.view_ro();
+      CDoubleTabView val_imp_face_bord = val_imp_face_bord_.view_ro();
+      CDoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_ro();
+      DoubleTabView resu = tab_resu.view_rw();
+      DoubleTabView flux_bords = op_base->flux_bords().view_rw();
+      int size = tab_aretes_bord.size();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {size, ncomp}),
+                           KOKKOS_LAMBDA(const int index, const int k)
+      {
+        const int n_arete = aretes_bord(index);
+        const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3);
+        double flux;
+        flux_evaluateur.template flux_arete_comp<Arete_Type>(inco, val_imp_face_bord, coeff_frottement_face_bord, a_r, fac1, fac2, fac3, signe, k, flux);
+        Kokkos::atomic_add(&resu(fac3, k), + signe * flux);
+        if (is_PAROI)
+          {
+            if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.5 * signe * flux);
+            if (fac2 < n) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac3)), - 0.5 * signe * flux);
+          }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
 
       // derivees : champ convecte
       Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) :
                                  (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
       if (matrice)
       {
-          flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2);
-          for (int i = 0; i < ncomp; i++)
-            fill_coeff_matrice_morse < Type_Double > (fac3, i, ncomp, signe, aii3_4, *matrice);
+          Type_Double aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp);
+          for (int index = 0; index < tab_aretes_bord.size(); index++)
+            {
+              const int n_arete = tab_aretes_bord(index);
+              const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), signe = Qdm_(n_arete, 3);
+              flux_evaluateur.template coeffs_arete<Arete_Type>(val_imp_face_bord_, coeff_frottement_face_bord_, tab_a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2);
+              for (int i = 0; i < ncomp; i++)
+                fill_coeff_matrice_morse<Type_Double>(fac3, i, ncomp, signe, aii3_4, *matrice);
+            }
         }
     }
 }
 
 template <class _TYPE_>  template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
 std::enable_if_t<Arete_Type == Type_Flux_Arete::FLUIDE || Arete_Type == Type_Flux_Arete::PAROI_FLUIDE || Arete_Type == Type_Flux_Arete::NAVIER_FLUIDE, void>
-Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const int ncomp, const matrices_t& mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const IntVect& tab_aretes_bord, const int ncomp, const matrices_t& mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
   if (should_calc_flux)
     {
       constexpr bool is_FLUIDE = (Arete_Type == Type_Flux_Arete::FLUIDE), is_PAROI_FL = (Arete_Type == Type_Flux_Arete::PAROI_FLUIDE);
-      Type_Double flux3(ncomp), flux1_2(ncomp), aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp);
-      const int n = le_dom->nb_faces_bord(), fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3);
-      DoubleTab& tab_flux_bords = op_base->flux_bords();
-      const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+      const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
 
-      const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
-                             &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
+      const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
+                                 &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
 
       // second membre
-      flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, signe, flux3, flux1_2);
-      for (int k = 0; k < ncomp; k++)
-        secmem(fac3, k) += signe * flux3[k];
-
-      fill_resu_tab < Type_Double > (fac1, fac2, ncomp, flux1_2, secmem);
-
-      if (is_FLUIDE || is_PAROI_FL)
-        {
-          if (fac1 < n)
-            for (int k = 0; k < ncomp; k++) tab_flux_bords(fac1, orientation(fac3)) -= 0.5 * signe * flux3[k];
-
-          if (fac2 < n)
-            for (int k = 0; k < ncomp; k++) tab_flux_bords(fac2, orientation(fac3)) -= 0.5 * signe * flux3[k];
-        }
+      const int n = le_dom->nb_faces_bord();
+      // A local copy of flux_evaluateur_ is mandatory here because it is a class attribute: the KOKKOS_LAMBDA below must use this copy instead of reaching the member through `this`
+      _TYPE_ flux_evaluateur = flux_evaluateur_;
+      flux_evaluateur.view_ro();
+      CIntTabView Qdm = Qdm_.view_ro();
+      CIntArrView aretes_bord = tab_aretes_bord.view_ro();
+      CIntArrView orientation = orientation_.view_ro();
+      CDoubleTabView a_r;
+      if (tab_a_r!=nullptr) a_r = tab_a_r->view_ro(); // a_r stays an unassigned view when tab_a_r is null
+      CDoubleTabView inco = tab_inco.view_ro();
+      CDoubleTabView val_imp_face_bord = val_imp_face_bord_.view_ro();
+      CDoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_ro();
+      DoubleTabView resu = tab_resu.view_rw();
+      DoubleTabView flux_bords = op_base->flux_bords().view_rw();
+      int size = tab_aretes_bord.size();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {size, ncomp}), // one work item per (edge index, component k)
+                           KOKKOS_LAMBDA(const int index, const int k)
+      {
+        const int n_arete = aretes_bord(index);
+        const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3);
+        double flux3, flux1_2;
+        flux_evaluateur.template flux_arete_comp<Arete_Type>(inco, val_imp_face_bord, coeff_frottement_face_bord, a_r, fac1, fac2, fac3, signe, ncomp, k, flux3, flux1_2);
+        Kokkos::atomic_add(&resu(fac3, k), + signe * flux3);
+        Kokkos::atomic_add(&resu(fac1, k), + flux1_2);
+        Kokkos::atomic_add(&resu(fac2, k), - flux1_2);
+        if (is_FLUIDE || is_PAROI_FL)
+          {
+            if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.5 * signe * flux3);
+            if (fac2 < n) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac3)), - 0.5 * signe * flux3);
+          }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
 
       // derivees : champ convecte
       Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) :
                                  (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
       if (matrice)
       {
-          flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2);
-          for (int i = 0; i < ncomp; i++)
+          Type_Double aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp);
+          for (int index = 0; index < tab_aretes_bord.size(); index++) // matrix coefficients are filled in a plain sequential loop, one edge at a time
             {
-              fill_coeff_matrice_morse < Type_Double > (fac3, i, ncomp, signe, aii3_4, *matrice);
-              fill_coeff_matrice_morse < Type_Double > (fac1, fac2, i, ncomp, aii1_2, ajj1_2, *matrice);
+              const int n_arete = tab_aretes_bord(index);
+              const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), signe = Qdm_(n_arete, 3);
+              flux_evaluateur.template coeffs_arete<Arete_Type>(val_imp_face_bord_, coeff_frottement_face_bord_, tab_a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2);
+              for (int i = 0; i < ncomp; i++)
+                {
+                  fill_coeff_matrice_morse<Type_Double>(fac3, i, ncomp, signe, aii3_4, *matrice);
+                  fill_coeff_matrice_morse<Type_Double>(fac1, fac2, i, ncomp, aii1_2, ajj1_2, *matrice);
+                }
             }
         }
     }
@@ -192,41 +253,58 @@ Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const
 
 template <class _TYPE_>  template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
 std::enable_if_t<Arete_Type == Type_Flux_Arete::PERIODICITE, void>
-Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const int ncomp, const matrices_t& mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const IntVect& tab_aretes_bord, const int ncomp, const matrices_t& mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
   if (should_calc_flux)
     {
-      Type_Double flux3_4(ncomp), flux1_2(ncomp), aii(ncomp), ajj(ncomp);
-      const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3);
-      const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
-
-
+      const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
 
       // second membre
-      flux_evaluateur.template flux_arete < Arete_Type > (inco, nullptr, fac1, fac2, fac3, fac4, flux3_4, flux1_2);
-      for (int k = 0; k < ncomp; k++)
-        {
-          secmem(fac3, k) += 0.5 * flux3_4[k];
-          secmem(fac4, k) -= 0.5 * flux3_4[k];
-        }
-
-      fill_resu_tab < Type_Double > (fac1, fac2, ncomp, flux1_2, secmem);
+      // A local copy of flux_evaluateur_ is mandatory here because it is a class attribute: the KOKKOS_LAMBDA below must use this copy instead of reaching the member through `this`
+      _TYPE_ flux_evaluateur = flux_evaluateur_;
+      flux_evaluateur.view_ro();
+      CIntTabView Qdm = Qdm_.view_ro();
+      CIntArrView aretes_bord = tab_aretes_bord.view_ro();
+      CDoubleTabView a_r; // deliberately left unassigned: the former host code passed nullptr for alpha_rho in the periodic case
+      CDoubleTabView inco = tab_inco.view_ro();
+      DoubleTabView resu = tab_resu.view_rw();
+      int size = tab_aretes_bord.size();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {size, ncomp}), // one work item per (edge index, component k)
+                           KOKKOS_LAMBDA(const int index, const int k)
+      {
+        const int n_arete = aretes_bord(index);
+        const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3);
+        double flux3_4, flux1_2;
+        flux_evaluateur.template flux_arete_comp<Arete_Type>(inco, a_r, fac1, fac2, fac3, fac4, k, flux3_4, flux1_2);
+        Kokkos::atomic_add(&resu(fac3, k), + 0.5 * flux3_4);
+        Kokkos::atomic_add(&resu(fac4, k), - 0.5 * flux3_4);
+        Kokkos::atomic_add(&resu(fac1, k), + flux1_2);
+        Kokkos::atomic_add(&resu(fac2, k), - flux1_2);
+      });
+      end_gpu_timer(__KERNEL_NAME__);
 
       // derivees : champ convecte
       Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) :
                                  (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
       if (matrice)
       {
-          flux_evaluateur.template coeffs_arete < Arete_Type > (nullptr, fac3, fac4, fac1, fac2, aii, ajj);
-          for (int i = 0; i < ncomp; i++)
-            fill_coeff_matrice_morse(fac1, fac2, i, ncomp, aii, ajj, *matrice);
-
-          flux_evaluateur.template coeffs_arete < Arete_Type > (nullptr, fac1, fac2, fac3, fac4, aii, ajj);
-          for (int i = 0; i < ncomp; i++)
+          Type_Double aii(ncomp), ajj(ncomp);
+          for (int index = 0; index < tab_aretes_bord.size(); index++) // matrix coefficients are filled in a plain sequential loop, one edge at a time
             {
-              aii[i] *= 0.5;
-              ajj[i] *= 0.5;
-              fill_coeff_matrice_morse < Type_Double > (fac3, fac4, i, ncomp, aii, ajj, *matrice);
+              const int n_arete = tab_aretes_bord(index);
+              const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), fac4 = Qdm_(n_arete, 3);
+              flux_evaluateur.template coeffs_arete<Arete_Type>(nullptr, fac3, fac4, fac1, fac2, aii, ajj);
+              for (int i = 0; i < ncomp; i++)
+                fill_coeff_matrice_morse(fac1, fac2, i, ncomp, aii, ajj, *matrice);
+
+              flux_evaluateur.template coeffs_arete<Arete_Type>(nullptr, fac1, fac2, fac3, fac4, aii, ajj); // same local evaluator as the call above (was the member flux_evaluateur_)
+              for (int i = 0; i < ncomp; i++)
+                {
+                  aii[i] *= 0.5; // halved to match the 0.5 * flux3_4 weighting applied to fac3/fac4 in the kernel
+                  ajj[i] *= 0.5;
+                  fill_coeff_matrice_morse<Type_Double>(fac3, fac4, i, ncomp, aii, ajj, *matrice);
+                }
             }
         }
     }
@@ -238,107 +316,163 @@ Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_bords_(const int n_arete, const
 template<class _TYPE_> template <typename Type_Double>
 void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins(const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
 {
-  for (int n_arete = premiere_arete_coin; n_arete < derniere_arete_coin; n_arete++)
+  if (aretes_coin_par_type_.size()==0) // the per-type lists are built only while the container is still empty
     {
-      const int n_type = type_arete_coin(n_arete - premiere_arete_coin);
-      switch(n_type)
+      // Fill aretes_coin_par_type_: corner edge numbers grouped by edge type
+      const int n_types = 17; // See enum type_arete in Domaine_Cl_VDF.h
+      aretes_coin_par_type_.resize(n_types);
+      ArrOfInt size(n_types);
+      for (int num_arete = 0; num_arete < type_arete_coin.size(); num_arete++) // pass 1: count the edges of each type
         {
-        case TypeAreteCoinVDF::PAROI_FLUIDE:
-          ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::PAROI_FLUIDE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteCoinVDF::FLUIDE_PAROI:
-          ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::FLUIDE_PAROI, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteCoinVDF::PERIO_PAROI:
-          ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::PERIO_PAROI, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteCoinVDF::FLUIDE_FLUIDE:
-          ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_COIN_FL, Type_Flux_Arete::COIN_FLUIDE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        case TypeAreteCoinVDF::PERIO_PERIO:
-          ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PERIO, Type_Flux_Arete::PERIODICITE, Type_Double>(n_arete, ncomp, mats, secmem, semi_impl);
-          break;
-        default:
-          break;
+          int n_type = type_arete_coin[num_arete];
+          if (n_type >= 0) // edges with a negative type are ignored
+            size(n_type)++;
+        }
+      for (int n_type = 0; n_type < n_types; n_type++)
+        {
+          aretes_coin_par_type_[n_type].resize(size(n_type));
+          size(n_type) = 0; // reset: reused below as the insertion cursor of each list
+        }
+      for (int num_arete = 0; num_arete < type_arete_coin.size(); num_arete++) // pass 2: store the edge numbers
+        {
+          int n_type = type_arete_coin[num_arete];
+          if (n_type >= 0)
+            {
+              aretes_coin_par_type_[n_type][size(n_type)] = premiere_arete_coin + num_arete; // type_arete_coin is indexed relative to premiere_arete_coin
+              size(n_type)++;
+            }
         }
     }
+  ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::PAROI_FLUIDE, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::PAROI_FLUIDE], ncomp, mats, secmem, semi_impl);
+  ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::FLUIDE_PAROI, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::FLUIDE_PAROI], ncomp, mats, secmem, semi_impl);
+  ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PAR, Type_Flux_Arete::PAROI, TypeAreteCoinVDF::PERIO_PAROI, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::PERIO_PAROI], ncomp, mats, secmem, semi_impl);
+  ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_COIN_FL, Type_Flux_Arete::COIN_FLUIDE, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::FLUIDE_FLUIDE], ncomp, mats, secmem, semi_impl);
+  ajouter_blocs_aretes_coins_<_TYPE_::CALC_ARR_PERIO, Type_Flux_Arete::PERIODICITE, Type_Double>(aretes_coin_par_type_[TypeAreteCoinVDF::PERIO_PERIO], ncomp, mats, secmem, semi_impl);
 }
 
 template <class _TYPE_> template <bool should_calc_flux, Type_Flux_Arete Arete_Type, TypeAreteCoinVDF::type_arete Arete_Type_Coin, typename Type_Double>
 std::enable_if_t< Arete_Type == Type_Flux_Arete::PAROI, void>
-Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const int n_arete, const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const IntVect& tab_aretes_coin, const int ncomp, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
   if (should_calc_flux)
     {
       constexpr bool is_PERIO_PAROI = (Arete_Type_Coin == TypeAreteCoinVDF::PERIO_PAROI);
-      Type_Double flux(ncomp), aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp);
-      const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3);
-      DoubleTab& tab_flux_bords = op_base->flux_bords();
-      const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
 
-      const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
-                             &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
+      const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+
+      const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
+                                 &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
 
       // second membre
-      flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, signe, flux);
-      for (int k = 0; k < ncomp; k++)
-        {
-          secmem(fac3, k) += signe * flux[k];
-          if (is_PERIO_PAROI) /* on met 0.25 sur les deux faces (car on  ajoutera deux fois la contrib) */
-            {
-              tab_flux_bords(fac1, orientation(fac3)) -= 0.25 * signe * flux[k];
-              tab_flux_bords(fac2, orientation(fac3)) -= 0.25 * signe * flux[k];
-            }
-          else
-            tab_flux_bords(fac1, orientation(fac3)) -= 0.5 * signe * flux[k];
-        }
+      // A local copy of flux_evaluateur_ is mandatory here because it is a class attribute: the KOKKOS_LAMBDA below must use this copy instead of reaching the member through `this`
+      _TYPE_ flux_evaluateur = flux_evaluateur_;
+      flux_evaluateur.view_ro();
+      CIntTabView Qdm = Qdm_.view_ro();
+      CIntArrView aretes_coin = tab_aretes_coin.view_ro();
+      CIntArrView orientation = orientation_.view_ro();
+      CDoubleTabView a_r;
+      if (tab_a_r!=nullptr) a_r = tab_a_r->view_ro(); // a_r stays an unassigned view when tab_a_r is null
+      CDoubleTabView inco = tab_inco.view_ro();
+      CDoubleTabView val_imp_face_bord = val_imp_face_bord_.view_ro();
+      CDoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_ro();
+      DoubleTabView resu = tab_resu.view_rw();
+      DoubleTabView flux_bords = op_base->flux_bords().view_rw();
+      int size = tab_aretes_coin.size();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {size, ncomp}), // one work item per (edge index, component k)
+                           KOKKOS_LAMBDA(const int index, const int k)
+      {
+        const int n_arete = aretes_coin(index);
+        const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3);
+        double flux;
+        flux_evaluateur.template flux_arete_comp<Arete_Type>(inco, val_imp_face_bord, coeff_frottement_face_bord, a_r, fac1, fac2, fac3, signe, k, flux);
+        Kokkos::atomic_add(&resu(fac3, k), + signe * flux);
+        if (is_PERIO_PAROI) // 0.25 is put on both faces because the contribution is added twice
+          {
+            Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.25 * signe * flux);
+            Kokkos::atomic_add(&flux_bords(fac2, orientation(fac3)), - 0.25 * signe * flux);
+          }
+        else
+          Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.5 * signe * flux);
+      });
+      end_gpu_timer(__KERNEL_NAME__);
 
       // derivees : champ convecte
       Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) :
                                  (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
       if (matrice)
       {
-          flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2);
-          for (int i = 0; i < ncomp; i++)
-            fill_coeff_matrice_morse(fac3, i, ncomp, signe, aii3_4, *matrice);
+          Type_Double aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp);
+          for (int index=0; index<tab_aretes_coin.size(); index++) // matrix coefficients are filled in a plain sequential loop, one edge at a time
+            {
+              const int n_arete = tab_aretes_coin(index);
+              const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), signe = Qdm_(n_arete, 3);
+              flux_evaluateur.template coeffs_arete<Arete_Type>(val_imp_face_bord_, coeff_frottement_face_bord_, tab_a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2);
+              for (int i = 0; i < ncomp; i++)
+                fill_coeff_matrice_morse(fac3, i, ncomp, signe, aii3_4, *matrice);
+            }
         }
     }
 }
 
 template <class _TYPE_> template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
 std::enable_if_t<Arete_Type == Type_Flux_Arete::COIN_FLUIDE, void>
-Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const int n_arete, const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const IntVect& tab_aretes_coin, const int ncomp, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
   if (should_calc_flux)
     {
-      Type_Double flux3(ncomp), flux1_2(ncomp), aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp);
-      const int n = le_dom->nb_faces_bord(), fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3);
-      DoubleTab& tab_flux_bords = op_base->flux_bords();
-      const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
 
-      const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
-                             &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
+      const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+
+      const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
+                                 &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
 
       // second membre
-      flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, signe, flux3, flux1_2);
-      for (int k = 0; k < ncomp; k++)
-        {
-          secmem(fac3, k) += signe * flux3[k];
-          secmem(fac1, k) += flux1_2[k];
-          if (fac1 < n) tab_flux_bords(fac1, orientation(fac3)) -= 0.5 * signe * flux3[k];
-        }
+      const int n = le_dom->nb_faces_bord();
+      // A local copy of flux_evaluateur_ is mandatory here because it is a class attribute: the KOKKOS_LAMBDA below must use this copy instead of reaching the member through `this`
+      _TYPE_ flux_evaluateur = flux_evaluateur_;
+      flux_evaluateur.view_ro();
+      CIntTabView Qdm = Qdm_.view_ro();
+      CIntArrView aretes_coin = tab_aretes_coin.view_ro();
+      CIntArrView orientation = orientation_.view_ro();
+      CDoubleTabView a_r;
+      if (tab_a_r!=nullptr) a_r = tab_a_r->view_ro(); // a_r stays an unassigned view when tab_a_r is null
+      CDoubleTabView inco = tab_inco.view_ro();
+      CDoubleTabView val_imp_face_bord = val_imp_face_bord_.view_ro();
+      CDoubleTabView coeff_frottement_face_bord = coeff_frottement_face_bord_.view_ro();
+      DoubleTabView resu = tab_resu.view_rw();
+      DoubleTabView flux_bords = op_base->flux_bords().view_rw();
+      int size = tab_aretes_coin.size();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {size, ncomp}), // one work item per (edge index, component k)
+                           KOKKOS_LAMBDA(const int index, const int k)
+      {
+        const int n_arete = aretes_coin(index);
+        const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), signe = Qdm(n_arete, 3);
+        double flux3, flux1_2;
+        flux_evaluateur.template flux_arete_comp<Arete_Type>(inco, val_imp_face_bord, coeff_frottement_face_bord, a_r, fac1, fac2, fac3, signe, k, flux3, flux1_2);
+        Kokkos::atomic_add(&resu(fac3, k), + signe * flux3);
+        Kokkos::atomic_add(&resu(fac1, k), + flux1_2);
+        if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), - 0.5 * signe * flux3); // flux_bords is only updated for face indices below nb_faces_bord()
+      });
+      end_gpu_timer(__KERNEL_NAME__);
 
       // derivees : champ convecte
       Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) :
                                  (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
       if (matrice)
       {
-          flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2);
-
-          for (int i = 0; i < ncomp; i++)
+          Type_Double aii1_2(ncomp), aii3_4(ncomp), ajj1_2(ncomp);
+          for (int index=0; index<tab_aretes_coin.size(); index++) // matrix coefficients are filled in a plain sequential loop, one edge at a time
             {
-              fill_coeff_matrice_morse(fac3, i, ncomp, signe, aii3_4, *matrice);
-              fill_coeff_matrice_morse < Type_Double > (fac1, i, ncomp, 1, aii1_2, *matrice);
+              const int n_arete = tab_aretes_coin(index);
+              const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), signe = Qdm_(n_arete, 3);
+              flux_evaluateur.template coeffs_arete<Arete_Type>(val_imp_face_bord_, coeff_frottement_face_bord_, tab_a_r, fac1, fac2, fac3, signe, aii1_2, aii3_4, ajj1_2);
+              for (int i = 0; i < ncomp; i++)
+                {
+                  fill_coeff_matrice_morse(fac3, i, ncomp, signe, aii3_4, *matrice);
+                  fill_coeff_matrice_morse<Type_Double>(fac1, i, ncomp, 1, aii1_2, *matrice);
+                }
             }
         }
     }
@@ -346,36 +480,55 @@ Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const int n_arete, const
 
 template <class _TYPE_> template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
 std::enable_if_t<Arete_Type == Type_Flux_Arete::PERIODICITE, void>
-Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const int n_arete, const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_coins_(const IntVect& tab_aretes_coin, const int ncomp, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
   if (should_calc_flux)
     {
-      Type_Double flux3_4(ncomp), flux1_2(ncomp), aii(ncomp), ajj(ncomp);
-      const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3);
-      const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+      const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
 
       // second membre
-      flux_evaluateur.template flux_arete < Arete_Type > (inco, nullptr, fac1, fac2, fac3, fac4, flux3_4, flux1_2);
-      for (int k = 0; k < ncomp; k++)
-        {
-          secmem(fac3, k) += 0.5 * flux3_4[k];
-          secmem(fac4, k) -= 0.5 * flux3_4[k];
-          secmem(fac1, k) += 0.5 * flux1_2[k];
-          secmem(fac2, k) -= 0.5 * flux1_2[k];
-        }
+      // A local copy of flux_evaluateur_ is mandatory here because it is a class attribute: the KOKKOS_LAMBDA below must use this copy instead of reaching the member through `this`
+      _TYPE_ flux_evaluateur = flux_evaluateur_;
+      flux_evaluateur.view_ro();
+      CIntTabView Qdm = Qdm_.view_ro();
+      CIntArrView aretes_coin = tab_aretes_coin.view_ro();
+      CDoubleTabView a_r; // deliberately left unassigned: the former host code passed nullptr for alpha_rho in the periodic case
+      CDoubleTabView inco = tab_inco.view_ro();
+      DoubleTabView resu = tab_resu.view_rw();
+      int size = tab_aretes_coin.size();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {size, ncomp}), // one work item per (edge index, component k)
+                           KOKKOS_LAMBDA(const int index, const int k)
+      {
+        const int n_arete = aretes_coin(index);
+        const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3);
+        double flux3_4, flux1_2;
+        flux_evaluateur.template flux_arete_comp<Arete_Type>(inco, a_r, fac1, fac2, fac3, fac4, k, flux3_4, flux1_2);
+        Kokkos::atomic_add(&resu(fac3, k), + 0.5 * flux3_4);
+        Kokkos::atomic_add(&resu(fac4, k), - 0.5 * flux3_4);
+        Kokkos::atomic_add(&resu(fac1, k), + 0.5 * flux1_2);
+        Kokkos::atomic_add(&resu(fac2, k), - 0.5 * flux1_2);
+      });
+      end_gpu_timer(__KERNEL_NAME__);
 
       // derivees : champ convecte
       Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) :
                                  (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
       if (matrice)
       {
-          flux_evaluateur.template coeffs_arete < Arete_Type > (nullptr, fac3, fac4, fac1, fac2, aii, ajj);
-          for (int i = 0; i < ncomp; i++)
-            fill_coeff_matrice_morse < Type_Double > (fac1, fac2, i, ncomp, aii, ajj, *matrice);
-
-          flux_evaluateur.template coeffs_arete < Arete_Type > (nullptr, fac1, fac2, fac3, fac4, aii, ajj);
-          for (int i = 0; i < ncomp; i++)
-            fill_coeff_matrice_morse < Type_Double > (fac3, fac4, i, ncomp, aii, ajj, *matrice);
+          Type_Double aii(ncomp), ajj(ncomp);
+          for (int index=0; index<tab_aretes_coin.size(); index++) // matrix coefficients are filled in a plain sequential loop, one edge at a time
+            {
+              const int n_arete = tab_aretes_coin(index);
+              const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), fac4 = Qdm_(n_arete, 3);
+              flux_evaluateur.template coeffs_arete<Arete_Type>(nullptr, fac3, fac4, fac1, fac2, aii, ajj);
+              for (int i = 0; i < ncomp; i++)
+                fill_coeff_matrice_morse<Type_Double>(fac1, fac2, i, ncomp, aii, ajj, *matrice);
+
+              flux_evaluateur.template coeffs_arete<Arete_Type>(nullptr, fac1, fac2, fac3, fac4, aii, ajj);
+              for (int i = 0; i < ncomp; i++)
+                fill_coeff_matrice_morse<Type_Double>(fac3, fac4, i, ncomp, aii, ajj, *matrice);
+            }
         }
     }
 }
@@ -401,87 +554,87 @@ void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_mixtes(const int ncomp, ma
 
 template <class _TYPE_> template <bool should_calc_flux, Type_Flux_Arete Arete_Type, typename Type_Double>
 std::enable_if_t<Arete_Type == Type_Flux_Arete::INTERNE || Arete_Type == Type_Flux_Arete::MIXTE, void>
-Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_generique_(const int debut, const int fin, const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_aretes_generique_(const int ndeb, const int nfin, const int ncomp, matrices_t mats, DoubleTab& tab_secmem, const tabs_t& semi_impl) const
 {
-  // XXX : tab_flux_bords rempli seulement si MIXTE ... ie pas INTERNE !
+
   if (should_calc_flux)
     {
       constexpr bool is_MIXTE = (Arete_Type == Type_Flux_Arete::MIXTE);
-      Type_Double flux(ncomp), aii(ncomp), ajj(ncomp);
-      DoubleTab& tab_flux_bords = op_base->flux_bords();
-      const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
-
-      const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
-                             &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
-
+      const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+      const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
+                                 &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
+      const int n = le_dom->nb_faces_bord(), n2 = le_dom->nb_faces_tot(); /* GF: ensures the sequential and parallel balances are equal */
+      // A local copy of flux_evaluateur_ is required here because it is a class attribute (the kernel below uses the copy)
+      _TYPE_ flux_evaluateur = flux_evaluateur_;
+      flux_evaluateur.view_ro();
+      CIntTabView Qdm = Qdm_.view_ro();
+      CIntArrView orientation = orientation_.view_ro();
+      CDoubleTabView inco = tab_inco.view_ro();
+      CDoubleTabView a_r = tab_a_r != nullptr ? tab_a_r->view_ro() : ConstView<double, 2>();
+      DoubleTabView secmem = tab_secmem.view_rw();
+      DoubleTabView flux_bords = op_base->flux_bords().view_rw(); // XXX: flux_bords is filled only for MIXTE edges, i.e. not for INTERNE ones!
       // second membre
-      for (int n_arete = debut; n_arete < fin; n_arete++)
-        {
-          flux = 0.;
-          const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3);
-          const int n = le_dom->nb_faces_bord(), n2 = le_dom->nb_faces_tot(); /* GF pour assurer bilan seq = para */
-          flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac1, fac2, fac3, fac4, flux);
-          fill_resu_tab < Type_Double > (fac3, fac4, ncomp, flux, secmem);
-
-          if (is_MIXTE)
-            {
-              if (fac4 < n2)
-                {
-                  if (fac1 < n)
-                    for (int k = 0; k < ncomp; k++) tab_flux_bords(fac1, orientation(fac3)) -= flux[k];
-
-                  if (fac2 < n)
-                    for (int k = 0; k < ncomp; k++) tab_flux_bords(fac2, orientation(fac4)) -= flux[k];
-                }
-              if (fac3 < n2)
-                {
-                  if (fac1 < n)
-                    for (int k = 0; k < ncomp; k++) tab_flux_bords(fac1, orientation(fac3)) += flux[k];
-
-                  if (fac2 < n)
-                    for (int k = 0; k < ncomp; k++) tab_flux_bords(fac2, orientation(fac4)) += flux[k];
-                }
-            }
-
-          flux_evaluateur.template flux_arete < Arete_Type > (inco, a_r, fac3, fac4, fac1, fac2, flux);
-          fill_resu_tab < Type_Double > (fac1, fac2, ncomp, flux, secmem);
-          if (is_MIXTE)
-            {
-              if (fac2 < n2)
-                {
-                  if (fac3 < n)
-                    for (int k = 0; k < ncomp; k++) tab_flux_bords(fac3, orientation(fac1)) -= flux[k];
-
-                  if (fac4 < n)
-                    for (int k = 0; k < ncomp; k++) tab_flux_bords(fac4, orientation(fac2)) -= flux[k];
-                }
-              if (fac1 < n2)
-                {
-                  if (fac3 < n)
-                    for (int k = 0; k < ncomp; k++) tab_flux_bords(fac3, orientation(fac1)) += flux[k];
-
-                  if (fac4 < n)
-                    for (int k = 0; k < ncomp; k++) tab_flux_bords(fac4, orientation(fac2)) += flux[k];
-                }
-            }
-        }
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>({ndeb, 0}, {nfin, ncomp}),
+                           KOKKOS_LAMBDA(const int n_arete, const int k)
+      {
+        const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3);
+        double flux_34;
+        flux_evaluateur.template flux_arete_comp<Arete_Type>(inco, a_r, fac1, fac2, fac3, fac4, k, flux_34);
+        Kokkos::atomic_add(&secmem(fac3, k), +flux_34);
+        Kokkos::atomic_add(&secmem(fac4, k), -flux_34);
+        if (is_MIXTE)
+          {
+            if (fac4 < n2)
+              {
+                if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), -flux_34);
+                if (fac2 < n) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac4)), -flux_34);
+              }
+            if (fac3 < n2)
+              {
+                if (fac1 < n) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac3)), +flux_34);
+                if (fac2 < n) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac4)), +flux_34);
+              }
+          }
+        double flux_12;
+        flux_evaluateur.template flux_arete_comp<Arete_Type>(inco, a_r, fac3, fac4, fac1, fac2, k, flux_12);
+        Kokkos::atomic_add(&secmem(fac1, k), +flux_12);
+        Kokkos::atomic_add(&secmem(fac2, k), -flux_12);
+        if (is_MIXTE)
+          {
+            if (fac2 < n2)
+              {
+                if (fac3 < n) Kokkos::atomic_add(&flux_bords(fac3, orientation(fac1)), -flux_12);
+                if (fac4 < n) Kokkos::atomic_add(&flux_bords(fac4, orientation(fac2)), -flux_12);
+              }
+            if (fac1 < n2)
+              {
+                if (fac3 < n) Kokkos::atomic_add(&flux_bords(fac3, orientation(fac1)), +flux_12);
+                if (fac4 < n) Kokkos::atomic_add(&flux_bords(fac4, orientation(fac2)), +flux_12);
+              }
+          }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
 
       // derivees : champ convecte
       Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
       if (matrice)
-        for (int n_arete = debut; n_arete < fin; n_arete++)
-          {
-            aii = 0., ajj = 0.;
-            const int fac1 = Qdm(n_arete, 0), fac2 = Qdm(n_arete, 1), fac3 = Qdm(n_arete, 2), fac4 = Qdm(n_arete, 3);
+        {
+          Type_Double aii(ncomp), ajj(ncomp);
+          for (int n_arete = ndeb; n_arete < nfin; n_arete++)
+            {
+              aii = 0., ajj = 0.;
+              const int fac1 = Qdm_(n_arete, 0), fac2 = Qdm_(n_arete, 1), fac3 = Qdm_(n_arete, 2), fac4 = Qdm_(n_arete, 3);
 
-            flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac3, fac4, fac1, fac2, aii, ajj);
-            for (int i = 0; i < ncomp; i++)
-              fill_coeff_matrice_morse < Type_Double > (fac1, fac2, i, ncomp, aii, ajj, *matrice);
+              flux_evaluateur.template coeffs_arete<Arete_Type>(tab_a_r, fac3, fac4, fac1, fac2, aii, ajj);
+              for (int i = 0; i < ncomp; i++)
+                fill_coeff_matrice_morse<Type_Double>(fac1, fac2, i, ncomp, aii, ajj, *matrice);
 
-            flux_evaluateur.template coeffs_arete < Arete_Type > (a_r, fac1, fac2, fac3, fac4, aii, ajj);
-            for (int i = 0; i < ncomp; i++)
-              fill_coeff_matrice_morse < Type_Double > (fac3, fac4, i, ncomp, aii, ajj, *matrice);
-          }
+              flux_evaluateur.template coeffs_arete<Arete_Type>(tab_a_r, fac1, fac2, fac3, fac4, aii, ajj);
+              for (int i = 0; i < ncomp; i++)
+                fill_coeff_matrice_morse<Type_Double>(fac3, fac4, i, ncomp, aii, ajj, *matrice);
+            }
+        }
     }
 }
 
@@ -517,44 +670,60 @@ void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_sortie_libre(const int ncomp,
 }
 
 template <class _TYPE_> template <bool should_calc_flux, Type_Flux_Fa7 Fa7_Type, typename Type_Double>
-void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_sortie_libre_(const int num_cl, const int ncomp , matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_sortie_libre_(const int num_cl, const int ncomp , matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
   // TODO : FIXME : tab_flux_bords pas rempli ...
   if (should_calc_flux)
     {
-      Type_Double flux(ncomp), aii(ncomp), ajj(ncomp);
-      const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
+
+      const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
       const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl);
       const Front_VF& frontiere_dis = ref_cast(Front_VF, la_cl->frontiere_dis());
       const int ndeb = frontiere_dis.num_premiere_face(), nfin = ndeb + frontiere_dis.nb_faces();
 
-      const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
-                             &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
+      const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
+                                 &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
 
       // second membre
-      for (int face = ndeb; face < nfin; face++)
-        {
-          flux_evaluateur.template flux_fa7 < Fa7_Type > (inco, a_r, face, (const Neumann_sortie_libre&) la_cl.valeur(), ndeb, flux);
-          if ((elem(face, 0)) > -1)
-            for (int k = 0; k < ncomp; k++) secmem(face, k) += flux[k];
-
-          if ((elem(face, 1)) > -1)
-            for (int k = 0; k < ncomp; k++) secmem(face, k) -= flux[k];
-        }
+      // A local copy of flux_evaluateur_ is required here because it is a class attribute (the kernel below uses the copy)
+      _TYPE_ flux_evaluateur = flux_evaluateur_;
+      flux_evaluateur.view_ro();
+      CIntTabView elem = elem_.view_ro();
+      CDoubleTabView inco = tab_inco.view_ro();
+      CDoubleTabView a_r;
+      if (tab_a_r != nullptr) a_r = tab_a_r->view_ro();
+      CDoubleTabView flux_impose = ref_cast(Neumann_sortie_libre, la_cl.valeur()).tab_flux_impose().view_ro(); // Used by Genepi+ !!!
+      DoubleTabView resu = tab_resu.view_rw();
+      // Kokkos kernel: one iteration per (face, component) pair
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>({ndeb, 0}, {nfin, ncomp}),
+                           KOKKOS_LAMBDA(const int face, const int k)
+      {
+        double flux;
+        flux_evaluateur.template flux_fa7_comp<Fa7_Type>(inco, a_r, face, flux_impose, ndeb, k, flux);
+        if (elem(face, 0) > -1) Kokkos::atomic_add(&resu(face, k), +flux);
+        if (elem(face, 1) > -1) Kokkos::atomic_add(&resu(face, k), -flux);
+      });
+      end_gpu_timer(__KERNEL_NAME__);
 
       // derivees : champ convecte
       Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) :
                                  (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
       if (matrice)
-        for (int face = ndeb; face < nfin; face++)
-        {
-            flux_evaluateur.template coeffs_fa7 < Fa7_Type > (a_r, face, (const Neumann_sortie_libre&) la_cl.valeur(), aii, ajj);
-            if ((elem(face, 0)) > -1)
-              for (int i = 0; i < ncomp; i++) fill_coeff_matrice_morse < Type_Double > (face, i, ncomp, 1, aii, *matrice);
-
-            if ((elem(face, 1)) > -1)
-              for (int i = 0; i < ncomp; i++) fill_coeff_matrice_morse < Type_Double > (face, i, ncomp, 1, ajj, *matrice);
-          }
+      {
+          Type_Double aii(ncomp), ajj(ncomp);
+          for (int face = ndeb; face < nfin; face++)
+            {
+              flux_evaluateur.template coeffs_fa7<Fa7_Type>(tab_a_r, face, (const Neumann_sortie_libre&) la_cl.valeur(), aii, ajj);
+              if ((elem_(face, 0)) > -1)
+                for (int i = 0; i < ncomp; i++)
+                  fill_coeff_matrice_morse<Type_Double>(face, i, ncomp, 1, aii, *matrice);
+
+              if ((elem_(face, 1)) > -1)
+                for (int i = 0; i < ncomp; i++)
+                  fill_coeff_matrice_morse<Type_Double>(face, i, ncomp, 1, ajj, *matrice);
+            }
+        }
     }
 }
 
@@ -562,109 +731,146 @@ void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_sortie_libre_(const int num_c
  * ====== FA7 ELEM =====
  * ===================== */
 template<class _TYPE_> template <typename Type_Double>
-void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_elem(const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+void Iterateur_VDF_Face<_TYPE_>::ajouter_blocs_fa7_elem(const int ncomp, matrices_t mats, DoubleTab& tab_secmem, const tabs_t& semi_impl) const
 {
-  DoubleTab& tab_flux_bords = op_base->flux_bords();
-  const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
-  Type_Double flux(ncomp), aii(ncomp), ajj(ncomp);
+  const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
   const int n_fc_bd = le_dom->nb_faces_bord();
 
-  const DoubleTab* a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
-                         &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
-//  const IntTab& f_e = le_dom->face_voisins();
+  const DoubleTab* tab_a_r = (!is_pb_multi || !is_conv_op_) ? nullptr : semi_impl.count("alpha_rho") ? &semi_impl.at("alpha_rho") :
+                             &ref_cast(Pb_Multiphase,op_base->equation().probleme()).equation_masse().champ_conserve().valeurs();
+
+  int dim = dimension;
+  // A local copy of flux_evaluateur_ is required here because it is a class attribute (the kernel below uses the copy)
+  _TYPE_ flux_evaluateur = flux_evaluateur_;
+  flux_evaluateur.view_ro();
+  CIntTabView elem_faces = elem_faces_.view_ro();
+  CIntArrView orientation = orientation_.view_ro();
+  CDoubleTabView inco = tab_inco.view_ro();
+  CDoubleTabView a_r = tab_a_r != nullptr ? tab_a_r->view_ro() : ConstView<double, 2>();
+  DoubleTabView secmem = tab_secmem.view_rw();
+  DoubleTabView flux_bords = op_base->flux_bords().view_rw(); // flux_bords is only updated for faces whose index is < n_fc_bd
   // second membre
-  for (int num_elem = 0; num_elem < nb_elem; num_elem++)
-    for (int fa7 = 0; fa7 < dimension; fa7++)
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                       Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {nb_elem, ncomp}),
+                       KOKKOS_LAMBDA(const int num_elem, const int k)
+  {
+    for (int fa7 = 0; fa7 < dim; fa7++)
       {
-        int fac1 = elem_faces(num_elem, fa7), fac2 = elem_faces(num_elem, fa7 + dimension);
-        flux_evaluateur.template flux_fa7 < Type_Flux_Fa7::ELEM > (inco, a_r, num_elem, fac1, fac2, flux);
-
-        fill_resu_tab < Type_Double > (fac1, fac2, ncomp, flux, secmem);
-
-        if (fac1 < n_fc_bd)
-          for (int k = 0; k < ncomp; k++) tab_flux_bords(fac1, orientation(fac1)) += flux[k];
-
-        if (fac2 < n_fc_bd)
-          for (int k = 0; k < ncomp; k++) tab_flux_bords(fac2, orientation(fac2)) -= flux[k];
+        const int fac1 = elem_faces(num_elem, fa7), fac2 = elem_faces(num_elem, fa7 + dim);
+        double flux;
+        flux_evaluateur.template flux_fa7_comp<Type_Flux_Fa7::ELEM>(inco, a_r, num_elem, fac1, fac2, k, flux);
+        Kokkos::atomic_add(&secmem(fac1, k), +flux);
+        Kokkos::atomic_add(&secmem(fac2, k), -flux);
+        if (fac1 < n_fc_bd) Kokkos::atomic_add(&flux_bords(fac1, orientation(fac1)), +flux);
+        if (fac2 < n_fc_bd) Kokkos::atomic_add(&flux_bords(fac2, orientation(fac2)), -flux);
       }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 
   // derivees : champ convecte
   Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) :
                              (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
   if (matrice)
   {
+      Type_Double aii(ncomp), ajj(ncomp);
+      ToDo_Kokkos("critical");
       for (int num_elem = 0; num_elem < nb_elem; num_elem++)
         for (int fa7 = 0; fa7 < dimension; fa7++)
           {
-            const int fac1 = elem_faces(num_elem, fa7), fac2 = elem_faces(num_elem, fa7 + dimension);
-            flux_evaluateur.template coeffs_fa7 < Type_Flux_Fa7::ELEM > (a_r, num_elem, fac1, fac2, aii, ajj);
+            const int fac1 = elem_faces_(num_elem, fa7), fac2 = elem_faces_(num_elem, fa7 + dimension);
+            flux_evaluateur.template coeffs_fa7 < Type_Flux_Fa7::ELEM > (tab_a_r, num_elem, fac1, fac2, aii, ajj);
             for (int i = 0; i < ncomp; i++)
               fill_coeff_matrice_morse < Type_Double > (fac1, fac2, i, ncomp, aii, ajj, *matrice);
           }
     }
 
   // On corrige si cl periodique ...
-  corriger_fa7_elem_periodicite<Type_Double>(ncomp, mats, secmem, semi_impl);
+  corriger_fa7_elem_periodicite<Type_Double>(ncomp, mats, tab_secmem, semi_impl);
 }
 
 template<class _TYPE_> template<typename Type_Double>
-void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite(const int ncomp, matrices_t mats, DoubleTab& secmem, const tabs_t& semi_impl) const
+void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite(const int ncomp, matrices_t mats, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
-  Type_Double flux(ncomp), aii(ncomp), ajj(ncomp);
-  const DoubleTab& inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_) : le_champ_convecte_ou_inc->valeurs();
-  Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
+  const DoubleTab& tab_inco = semi_impl.count(nom_ch_inco_) ? semi_impl.at(nom_ch_inco_)
+                              : le_champ_convecte_ou_inc->valeurs();
+  Matrice_Morse *matrice = (is_pb_multi && is_conv_op_) ? (mats.count(nom_ch_inco_) && !semi_impl.count(nom_ch_inco_)
+                                                           ? mats.at(nom_ch_inco_) : nullptr) : (mats.count(
+                                                                                                     nom_ch_inco_) ? mats.at(nom_ch_inco_) : nullptr);
 
   for (int num_cl = 0; num_cl < le_dom->nb_front_Cl(); num_cl++)
-    {
+  {
       const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl);
       if (sub_type(Periodique, la_cl.valeur()))
         {
           const Periodique& la_cl_perio = ref_cast(Periodique, la_cl.valeur());
           const Front_VF& le_bord = ref_cast(Front_VF, la_cl_perio.frontiere_dis());
-          int num_elem, signe, fac1, fac2, ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
+          int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
 
           // second membre
-          for (int face = ndeb; face < nfin; face++)
-            {
-              flux = 0.;
-              corriger_fa7_elem_periodicite__(face, num_elem, signe, fac1, fac2);
-
-              flux_evaluateur.template flux_fa7 < Type_Flux_Fa7::ELEM > (inco, nullptr, num_elem, fac1, fac2, flux);
-              for (int k = 0; k < ncomp; k++) secmem(face, k) += signe * flux[k];
-            }
+          // A local copy of flux_evaluateur_ is required here because it is a class attribute (the kernel below uses the copy)
+          _TYPE_ flux_evaluateur = flux_evaluateur_;
+          flux_evaluateur.view_ro();
+          int dim = Objet_U::dimension;
+          CIntTabView elem = elem_.view_ro();
+          CIntTabView elem_faces = elem_faces_.view_ro();
+          CIntArrView orientation = orientation_.view_ro();
+          CDoubleTabView a_r;
+          CDoubleTabView inco = tab_inco.view_ro();
+          DoubleTabView resu = tab_resu.view_rw();
+          // Kokkos kernel: one iteration per (face, component) pair
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
+                               Kokkos::MDRangePolicy<Kokkos::Rank<2>>({ndeb, 0}, {nfin, ncomp}),
+                               KOKKOS_LAMBDA(const int face, const int k)
+          {
+            const int elem1 = elem(face, 0), elem2 = elem(face, 1), ori = orientation(face);
+            int num_elem, signe;
+            if ((face == elem_faces(elem1, ori)) || (face == elem_faces(elem1, ori + dim)))
+              { num_elem = elem2; signe = 1; }
+            else
+              { num_elem = elem1; signe = -1; }
+            const int fac1 = elem_faces(num_elem, ori), fac2 = elem_faces(num_elem, ori + dim);
+            double flux;
+            flux_evaluateur.template flux_fa7_comp<Type_Flux_Fa7::ELEM>(inco, a_r, num_elem, fac1, fac2, k, flux);
+            Kokkos::atomic_add(&resu(face, k), +signe * flux);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
 
           // derivees : champ convecte
           if (matrice)
-            for (int face = ndeb; face < nfin; face++)
-              {
-                aii = 0., ajj = 0.;
-                corriger_fa7_elem_periodicite__(face, num_elem, signe, fac1, fac2);
-
-                flux_evaluateur.template coeffs_fa7 < Type_Flux_Fa7::ELEM > (nullptr, num_elem, fac1, fac2, aii, ajj);
-                const auto& tab1 = (*matrice).get_set_tab1();
-                const auto& tab2 = (*matrice).get_set_tab2();
-                auto& coeff = (*matrice).get_set_coeff();
-                if (signe > 0) /* on a oublie a droite  la contribution de la gauche */
-                  {
-                    for (int i = 0; i < ncomp; i++)
-                      for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++)
-                        if (tab2[k] - 1 == face * ncomp + i) coeff[k] += aii[i];
-
-                    for (int i = 0; i < ncomp; i++)
-                      for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++)
-                        if (tab2[k] - 1 == fac2 * ncomp + i) coeff[k] -= ajj[i];
-                  }
-                else /* on a oublie a gauche  la contribution de la droite */
-                  {
-                    for (int i = 0; i < ncomp; i++)
-                      for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++)
-                        if (tab2[k] - 1 == fac1 * ncomp + i) coeff[k] -= aii[i];
-
-                    for (int i = 0; i < ncomp; i++)
-                      for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++)
-                        if (tab2[k] - 1 == face * ncomp + i) coeff[k] += ajj[i];
-                  }
-              }
+            {
+              Type_Double aii(ncomp), ajj(ncomp);
+              int num_elem, signe, fac1, fac2;
+              for (int face = ndeb; face < nfin; face++)
+                {
+                  aii = 0., ajj = 0.;
+                  corriger_fa7_elem_periodicite__(face, num_elem, signe, fac1, fac2);
+
+                  flux_evaluateur.template coeffs_fa7<Type_Flux_Fa7::ELEM>(nullptr, num_elem, fac1, fac2, aii, ajj);
+                  const auto& tab1 = (*matrice).get_set_tab1();
+                  const auto& tab2 = (*matrice).get_set_tab2();
+                  auto& coeff = (*matrice).get_set_coeff();
+                  if (signe > 0) /* the right side is missing the contribution from the left */
+                    {
+                      for (int i = 0; i < ncomp; i++)
+                        for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++)
+                          if (tab2[k] - 1 == face * ncomp + i) coeff[k] += aii[i];
+
+                      for (int i = 0; i < ncomp; i++)
+                        for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++)
+                          if (tab2[k] - 1 == fac2 * ncomp + i) coeff[k] -= ajj[i];
+                    }
+                  else   /* the left side is missing the contribution from the right */
+                    {
+                      for (int i = 0; i < ncomp; i++)
+                        for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++)
+                          if (tab2[k] - 1 == fac1 * ncomp + i) coeff[k] -= aii[i];
+
+                      for (int i = 0; i < ncomp; i++)
+                        for (auto k = tab1[face * ncomp + i] - 1; k < tab1[face * ncomp + 1 + i] - 1; k++)
+                          if (tab2[k] - 1 == face * ncomp + i) coeff[k] += ajj[i];
+                    }
+                }
+            }
         }
     }
 }
@@ -672,8 +878,8 @@ void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite(const int ncomp,
 template<class _TYPE_>
 void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite__(const int face, int& num_elem, int& signe, int& fac1, int& fac2) const
 {
-  const int elem1 = elem(face, 0), elem2 = elem(face, 1), ori = orientation(face);
-  if ((face == elem_faces(elem1, ori)) || (face == elem_faces(elem1, ori + dimension)))
+  const int elem1 = elem_(face, 0), elem2 = elem_(face, 1), ori = orientation_(face);
+  if ((face == elem_faces_(elem1, ori)) || (face == elem_faces_(elem1, ori + dimension)))
     {
       num_elem = elem2;
       signe = 1;
@@ -683,7 +889,7 @@ void Iterateur_VDF_Face<_TYPE_>::corriger_fa7_elem_periodicite__(const int face,
       num_elem = elem1;
       signe = -1;
     }
-  fac1 = elem_faces(num_elem, ori), fac2 = elem_faces(num_elem, ori + dimension);
+  fac1 = elem_faces_(num_elem, ori), fac2 = elem_faces_(num_elem, ori + dimension);
 }
 
 /* ========================= *
@@ -721,16 +927,6 @@ void Iterateur_VDF_Face<_TYPE_>::ajouter_pour_compressible(const int ncomp, matr
 
 // ===================================================================================================
 
-template<class _TYPE_> template<typename Type_Double>
-inline void Iterateur_VDF_Face<_TYPE_>::fill_resu_tab(const int fac1, const int fac2, const int ncomp, const Type_Double& flux, DoubleTab& resu) const
-{
-  for (int k = 0; k < ncomp; k++)
-    {
-      resu(fac1, k) += flux[k];
-      resu(fac2, k) -= flux[k];
-    }
-}
-
 template<class _TYPE_> template<typename Type_Double>
 void Iterateur_VDF_Face<_TYPE_>::fill_coeff_matrice_morse(const int face, const int i, const int ncomp, const int signe, const Type_Double& A, Matrice_Morse& matrice) const
 {
@@ -759,6 +955,7 @@ void Iterateur_VDF_Face<_TYPE_>::fill_coeff_matrice_morse(const int fac1, const
     }
 }
 
+
 #include <Iterateur_VDF_Face_bis.tpp>
 
 #endif /* Iterateur_VDF_Face_TPP_included */
diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face_bis.tpp b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face_bis.tpp
index 8cd860ad60..00f076259a 100644
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face_bis.tpp
+++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_Face_bis.tpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -18,26 +18,28 @@
 
 template<class _TYPE_>
 inline Iterateur_VDF_Face<_TYPE_>::Iterateur_VDF_Face(const Iterateur_VDF_Face<_TYPE_>& iter) :
-  Iterateur_VDF_base(iter), flux_evaluateur(iter.flux_evaluateur), nb_elem(iter.nb_elem), premiere_arete_interne(iter.premiere_arete_interne), derniere_arete_interne(iter.derniere_arete_interne),
+  Iterateur_VDF_base(iter), flux_evaluateur_(iter.flux_evaluateur_), nb_elem(iter.nb_elem), premiere_arete_interne(iter.premiere_arete_interne), derniere_arete_interne(iter.derniere_arete_interne),
   premiere_arete_mixte(iter.premiere_arete_mixte), derniere_arete_mixte(iter.derniere_arete_mixte), premiere_arete_bord(iter.premiere_arete_bord), derniere_arete_bord(iter.derniere_arete_bord),
   premiere_arete_coin(iter.premiere_arete_coin), derniere_arete_coin(iter.derniere_arete_coin)
 {
-  orientation.ref(iter.orientation);
-  Qdm.ref(iter.Qdm);
-  elem.ref(iter.elem);
-  elem_faces.ref(iter.elem_faces);
+  orientation_.ref(iter.orientation_);
+  Qdm_.ref(iter.Qdm_);
+  elem_.ref(iter.elem_);
+  elem_faces_.ref(iter.elem_faces_);
   type_arete_bord.ref(iter.type_arete_bord);
   type_arete_coin.ref(iter.type_arete_coin);
+  val_imp_face_bord_.ref(iter.val_imp_face_bord_);
+  coeff_frottement_face_bord_.ref(iter.coeff_frottement_face_bord_);
 }
 
 template<class _TYPE_>
 void Iterateur_VDF_Face<_TYPE_>::completer_()
 {
   nb_elem = le_dom->nb_elem_tot();
-  orientation.ref(le_dom->orientation());
-  Qdm.ref(le_dom->Qdm());
-  elem.ref(le_dom->face_voisins());
-  elem_faces.ref(le_dom->elem_faces());
+  orientation_.ref(le_dom->orientation());
+  Qdm_.ref(le_dom->Qdm());
+  elem_.ref(le_dom->face_voisins());
+  elem_faces_.ref(le_dom->elem_faces());
   type_arete_bord.ref(la_zcl->type_arete_bord());
   type_arete_coin.ref(la_zcl->type_arete_coin());
   premiere_arete_interne = le_dom->premiere_arete_interne();
@@ -62,8 +64,12 @@ inline void Iterateur_VDF_Face<_TYPE_>::multiply_by_rho_if_hydraulique(DoubleTab
         {
           const double coef = rho.valeurs()(0, 0);
           const int nb_faces_bord = le_dom->nb_faces_bord();
-          for (int face = 0; face < nb_faces_bord; face++)
-            for (int k = 0; k < tab_flux_bords.line_size(); k++) tab_flux_bords(face, k) *= coef;
+          DoubleTabView flux_bords = tab_flux_bords.view_rw();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces_bord), KOKKOS_LAMBDA(const int face)
+          {
+            for (int k = 0; k < (int)flux_bords.extent(1); k++) flux_bords(face, k) *= coef;
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
 }
@@ -77,45 +83,51 @@ int Iterateur_VDF_Face<_TYPE_>::impr(Sortie& os) const
   const Schema_Temps_base& sch = la_zcl->equation().probleme().schema_temps();
   DoubleTab& tab_flux_bords = op_base->flux_bords();
   DoubleVect bilan(tab_flux_bords.dimension(1));
-  DoubleTab xgr;
-  if (impr_mom) xgr = le_dom->calculer_xgr();
-  int k, face, nb_front_Cl = le_dom->nb_front_Cl();
-  DoubleTrav flux_bords2(5, nb_front_Cl, tab_flux_bords.dimension(1));
-  flux_bords2 = 0;
+  DoubleTab tab_xgr;
+  if (impr_mom) tab_xgr = le_dom->calculer_xgr();
+  int nb_front_Cl = le_dom->nb_front_Cl();
+  DoubleTrav tab_flux_bords2(5, nb_front_Cl, tab_flux_bords.dimension(1));
+  tab_flux_bords2 = 0;
+  const int dim = Objet_U::dimension;
+  const int ncomp = tab_flux_bords.dimension(1);
+  CDoubleTabView flux_bords = tab_flux_bords.view_ro();
+  DoubleTabView3 flux_bords2 = tab_flux_bords2.view_rw<3>();
+  CDoubleTabView xgr;
+  if (impr_mom) xgr = tab_xgr.view_ro();
   for (int num_cl = 0; num_cl < nb_front_Cl; num_cl++)
     {
       const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl);
       const Front_VF& frontiere_dis = ref_cast(Front_VF, la_cl->frontiere_dis());
-      int ndeb = frontiere_dis.num_premiere_face(), nfin = ndeb + frontiere_dis.nb_faces(), periodicite = (type_cl(la_cl) == periodique ? 1 : 0);
-      for (face = ndeb; face < nfin; face++)
-        {
-          for (k = 0; k < tab_flux_bords.dimension(1); k++)
-            {
-              flux_bords2(0, num_cl, k) += tab_flux_bords(face, k);
-              if (periodicite)
-                {
-                  if (face < (ndeb + frontiere_dis.nb_faces() / 2))
-                    flux_bords2(1, num_cl, k) += tab_flux_bords(face, k);
-                  else
-                    flux_bords2(2, num_cl, k) += tab_flux_bords(face, k);
-                }
-              if (mon_dom.bords_a_imprimer_sum().contient(frontiere_dis.le_nom()))
-                flux_bords2(3, num_cl, k) += tab_flux_bords(face, k);
-            } /* fin for k */
-          if (impr_mom)
-            {
-              if (dimension == 2)
-                flux_bords2(4, num_cl, 0) += tab_flux_bords(face, 1) * xgr(face, 0) - tab_flux_bords(face, 0) * xgr(face, 1);
-              else
-                {
-                  flux_bords2(4, num_cl, 0) += tab_flux_bords(face, 2) * xgr(face, 1) - tab_flux_bords(face, 1) * xgr(face, 2);
-                  flux_bords2(4, num_cl, 1) += tab_flux_bords(face, 0) * xgr(face, 2) - tab_flux_bords(face, 2) * xgr(face, 0);
-                  flux_bords2(4, num_cl, 2) += tab_flux_bords(face, 1) * xgr(face, 0) - tab_flux_bords(face, 0) * xgr(face, 1);
-                }
-            }
-        } /* fin for face */
+      const int ndeb = frontiere_dis.num_premiere_face(), nfin = ndeb + frontiere_dis.nb_faces(), periodicite = (type_cl(la_cl) == periodique ? 1 : 0);
+      const int first_half_end = ndeb + frontiere_dis.nb_faces() / 2;
+      const int impr_boundary = mon_dom.bords_a_imprimer_sum().contient(frontiere_dis.le_nom()) ? 1 : 0;
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int face)
+      {
+        for (int k = 0; k < ncomp; k++)
+          {
+            Kokkos::atomic_add(&flux_bords2(0, num_cl, k), flux_bords(face, k));
+            if (periodicite)
+              {
+                if (face < first_half_end) Kokkos::atomic_add(&flux_bords2(1, num_cl, k), flux_bords(face, k));
+                else Kokkos::atomic_add(&flux_bords2(2, num_cl, k), flux_bords(face, k));
+              }
+            if (impr_boundary) Kokkos::atomic_add(&flux_bords2(3, num_cl, k), flux_bords(face, k));
+          }
+        if (impr_mom)
+          {
+            if (dim == 2)
+              Kokkos::atomic_add(&flux_bords2(4, num_cl, 0), flux_bords(face, 1) * xgr(face, 0) - flux_bords(face, 0) * xgr(face, 1));
+            else
+              {
+                Kokkos::atomic_add(&flux_bords2(4, num_cl, 0), flux_bords(face, 2) * xgr(face, 1) - flux_bords(face, 1) * xgr(face, 2));
+                Kokkos::atomic_add(&flux_bords2(4, num_cl, 1), flux_bords(face, 0) * xgr(face, 2) - flux_bords(face, 2) * xgr(face, 0));
+                Kokkos::atomic_add(&flux_bords2(4, num_cl, 2), flux_bords(face, 1) * xgr(face, 0) - flux_bords(face, 0) * xgr(face, 1));
+              }
+          }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
-  mp_sum_for_each_item(flux_bords2);
+  mp_sum_for_each_item(tab_flux_bords2);
   if (je_suis_maitre())
     {
       op_base->ouvrir_fichier(Flux, "", 1);
@@ -130,30 +142,30 @@ int Iterateur_VDF_Face<_TYPE_>::impr(Sortie& os) const
         {
           const Cond_lim& la_cl = la_zcl->les_conditions_limites(num_cl);
           int periodicite = (type_cl(la_cl) == periodique ? 1 : 0);
-          for (k = 0; k < tab_flux_bords.dimension(1); k++)
+          for (int k = 0; k < tab_flux_bords.dimension(1); k++)
             {
               if (periodicite)
                 {
-                  Flux.add_col(flux_bords2(1, num_cl, k));
-                  Flux.add_col(flux_bords2(2, num_cl, k));
+                  Flux.add_col(tab_flux_bords2(1, num_cl, k));
+                  Flux.add_col(tab_flux_bords2(2, num_cl, k));
                 }
               else
-                Flux.add_col(flux_bords2(0, num_cl, k));
+                Flux.add_col(tab_flux_bords2(0, num_cl, k));
 
               if (impr_sum)
-                Flux_sum.add_col(flux_bords2(3, num_cl, k));
-              bilan(k) += flux_bords2(0, num_cl, k);
+                Flux_sum.add_col(tab_flux_bords2(3, num_cl, k));
+              bilan(k) += tab_flux_bords2(0, num_cl, k);
             }
           if (dimension == 3)
             {
-              for (k = 0; k < tab_flux_bords.dimension(1); k++)
+              for (int k = 0; k < tab_flux_bords.dimension(1); k++)
                 if (impr_mom)
-                  Flux_moment.add_col(flux_bords2(4, num_cl, k));
+                  Flux_moment.add_col(tab_flux_bords2(4, num_cl, k));
             }
           else if (impr_mom)
-            Flux_moment.add_col(flux_bords2(4, num_cl, 0));
+            Flux_moment.add_col(tab_flux_bords2(4, num_cl, 0));
         } /* fin for num_cl */
-      for (k = 0; k < tab_flux_bords.dimension(1); k++)
+      for (int k = 0; k < tab_flux_bords.dimension(1); k++)
         Flux.add_col(bilan(k));
       Flux << finl;
       if (impr_sum)
@@ -181,13 +193,13 @@ int Iterateur_VDF_Face<_TYPE_>::impr(Sortie& os) const
                   sch.imprimer_temps_courant(Flux_face);
                   Flux_face << " : " << finl;
                 }
-              for (face = ndeb; face < nfin; face++)
+              for (int face = ndeb; face < nfin; face++)
                 {
                   if (dimension == 2)
                     Flux_face << "# Face a x= " << le_dom->xv(face, 0) << " y= " << le_dom->xv(face, 1) << " : ";
                   else if (dimension == 3)
                     Flux_face << "# Face a x= " << le_dom->xv(face, 0) << " y= " << le_dom->xv(face, 1) << " z= " << le_dom->xv(face, 2) << " : ";
-                  for (k = 0; k < tab_flux_bords.dimension(1); k++)
+                  for (int k = 0; k < tab_flux_bords.dimension(1); k++)
                     Flux_face << tab_flux_bords(face, k) << " ";
                   Flux_face << finl;
                 }
diff --git a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_base.h b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_base.h
index 2b04e9208a..879dbd14ca 100644
--- a/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_base.h
+++ b/src/VDF/Operateurs/Iterateurs/Iterateur_VDF_base.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -97,7 +97,9 @@ class Iterateur_VDF_base : public Objet_U
 
   virtual DoubleTab& ajouter(const DoubleTab& inco, DoubleTab& secmem) const final
   {
-    ajouter_blocs({}, secmem, {{ op_base->equation().inconnue().le_nom().getString(), inco }});
+    tabs_t semi_impl; // single entry, keyed by the name of the equation's unknown
+    semi_impl[op_base->equation().inconnue().le_nom().getString()].ref(inco); /* ref() avoids copying inco into tabs_t */
+    ajouter_blocs({}, secmem, semi_impl);
     return secmem;
   }
 
diff --git a/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.cpp b/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.cpp
index 5aa0396145..6833bf5650 100644
--- a/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.cpp
+++ b/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.cpp
@@ -45,12 +45,21 @@ Entree& Op_Conv_VDF_base::readOn(Entree& s)
   return s;
 }
 
+// TODO: try to merge the two eval_fluent overloads below (DoubleTab / DoubleTabView) into a single template:
 inline void eval_fluent(const double psc, const int num1, const int num2, const int n, DoubleTab& fluent)
 {
   if (psc >= 0) fluent(num2, n) += psc;
   else fluent(num1, n) -= psc;
 }
 
+KOKKOS_INLINE_FUNCTION void eval_fluent(const double psc, const int num1, const int num2, const int n, DoubleTabView fluent) // View overload: adds |psc| to fluent(num2, n) when psc >= 0, otherwise to fluent(num1, n)
+{
+  if (psc >= 0)
+    Kokkos::atomic_add(&fluent(num2, n), +psc); // atomic accumulation; presumably several faces can update the same row concurrently - TODO confirm
+  else
+    Kokkos::atomic_add(&fluent(num1, n), -psc); // psc < 0 here, so -psc is the positive magnitude
+}
+
 void Op_Conv_VDF_base::completer()
 {
   Operateur_base::completer();
@@ -110,18 +119,23 @@ void Op_Conv_VDF_base::dimensionner_blocs_elem(matrices_t mats, const tabs_t& se
           {
             const IntTab& fcl_v = ref_cast(Champ_Face_VDF, vitesse()).fcl();
 
+            ToDo_Kokkos("critical");
             for (f = 0; f < domaine.nb_faces_tot(); f++)
               if (fcl_v(f, 0) < 2)
                 for (i = 0; i < 2; i++)
                   if ((e = f_e(f, i)) >= 0 && e < domaine.nb_elem_tot())
                     for (n = 0; n < N; n++) stencil.append_line(N * e + n, M * f + n * (M > 1));
           }
-        else for (f = 0; f < domaine.nb_faces_tot(); f++)
-            for (i = 0; i < 2; i++)
-              if ((e = f_e(f, i)) >= 0 && e < domaine.nb_elem_tot()) /* inconnues scalaires */
-                for (j = 0; j < 2; j++)
-                  if ((eb = f_e(f, j)) >= 0)
-                    for (n = 0, m = 0; n < N; n++, m += (M > 1)) stencil.append_line(N * e + n, M * eb + m);
+        else
+          {
+            ToDo_Kokkos("critical");
+            for (f = 0; f < domaine.nb_faces_tot(); f++)
+              for (i = 0; i < 2; i++)
+                if ((e = f_e(f, i)) >= 0 && e < domaine.nb_elem_tot()) /* inconnues scalaires */
+                  for (j = 0; j < 2; j++)
+                    if ((eb = f_e(f, j)) >= 0)
+                      for (n = 0, m = 0; n < N; n++, m += (M > 1)) stencil.append_line(N * e + n, M * eb + m);
+          }
 
         tableau_trier_retirer_doublons(stencil);
         const int nl = equation().inconnue().valeurs().size_totale(),
@@ -151,6 +165,7 @@ void Op_Conv_VDF_base::dimensionner_blocs_face(matrices_t matrices, const tabs_t
 
 
   /* agit uniquement aux elements; diagonale omise */
+  ToDo_Kokkos("critical");
   for (int f = 0; f < domaine.nb_faces_tot(); f++)
     if (f_e(f, 0) >= 0 && (f_e(f, 1) >= 0 || fcl(f, 0) == 3))
       for (int i = 0; i < 2; i++)
@@ -179,13 +194,9 @@ double Op_Conv_VDF_base::calculer_dt_stab() const
 {
   const Domaine_VDF& domaine_VDF = iter_->domaine();
   const Domaine_Cl_VDF& domaine_Cl_VDF = iter_->domaine_Cl();
-  const IntTab& face_voisins = domaine_VDF.face_voisins();
-  const DoubleVect& volumes = domaine_VDF.volumes();
-  const DoubleVect& face_surfaces = domaine_VDF.face_surfaces();
   const DoubleTab& vit_associe = vitesse().valeurs();
-  const DoubleTab& vit= (vitesse_pour_pas_de_temps_?vitesse_pour_pas_de_temps_->valeurs(): vit_associe);
-  const int N = std::min(vit.line_size(), equation().inconnue().valeurs().line_size());
-  const DoubleTab* alp = sub_type(Pb_Multiphase, equation().probleme()) ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr;
+  const DoubleTab& tab_vit = (vitesse_pour_pas_de_temps_?vitesse_pour_pas_de_temps_->valeurs(): vit_associe);
+  const int N = std::min(tab_vit.line_size(), equation().inconnue().valeurs().line_size());
   if (!fluent_.get_md_vector())
     {
       fluent_.resize(0, N);
@@ -193,8 +204,6 @@ double Op_Conv_VDF_base::calculer_dt_stab() const
     }
   fluent_ = 0;
   // Remplissage du tableau fluent
-  double psc;
-  int num1, num2, face, elem1;
 
   // On traite les bords
   for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++)
@@ -203,46 +212,72 @@ double Op_Conv_VDF_base::calculer_dt_stab() const
       if ( sub_type(Dirichlet_entree_fluide,la_cl.valeur()) || sub_type(Neumann_sortie_libre,la_cl.valeur()) )
         {
           const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
-          num1 = le_bord.num_premiere_face();
-          num2 = num1 + le_bord.nb_faces();
-          for (face=num1; face<num2; face++)
+          const int num1 = le_bord.num_premiere_face();
+          const int num2 = num1 + le_bord.nb_faces();
+          CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+          CDoubleArrView face_surfaces = domaine_VDF.face_surfaces().view_ro();
+          CDoubleTabView vit = tab_vit.view_ro();
+          DoubleTabView fluent = fluent_.view_rw();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(num1, num2), KOKKOS_LAMBDA(const int face)
+          {
             for (int n = 0; n < N; n++)
               {
-                psc = vit(face, n) * face_surfaces(face);
-                if ( (elem1 = face_voisins(face,0)) != -1)
+                const double psc = vit(face, n) * face_surfaces(face);
+                int elem1 = face_voisins(face,0);
+                if (elem1 != -1)
                   {
-                    if (psc < 0) fluent_(elem1, n) -= psc;
+                    if (psc < 0)
+                      Kokkos::atomic_add(&fluent(elem1, n), -psc);
                   }
                 else // (elem2 != -1)
-                  if (psc > 0) fluent_(face_voisins(face,1), n) += psc;
+                  if (psc > 0)
+                    Kokkos::atomic_add(&fluent(face_voisins(face,1), n), +psc);
               }
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
 
   // Boucle sur les faces internes pour remplir fluent
   const int domaine_VDF_nb_faces = domaine_VDF.nb_faces(), premiere_face = domaine_VDF.premiere_face_int();
-  for (face = premiere_face; face < domaine_VDF_nb_faces; face++)
+  CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+  CDoubleArrView face_surfaces = domaine_VDF.face_surfaces().view_ro();
+  CDoubleTabView vit = tab_vit.view_ro();
+  DoubleTabView fluent = fluent_.view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(premiere_face, domaine_VDF_nb_faces), KOKKOS_LAMBDA(const int face)
+  {
     for (int n = 0; n < N; n++)
       {
-        psc = vit(face, n) * face_surfaces(face);
-        eval_fluent(psc, face_voisins(face, 0), face_voisins(face, 1), n, fluent_);
+        const double psc = vit(face, n) * face_surfaces(face);
+        eval_fluent(psc, face_voisins(face, 0), face_voisins(face, 1), n, fluent);
       }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 
   // Calcul du pas de temps de stabilite a partir du tableau fluent
   if (vitesse().le_nom()=="rho_u" && equation().probleme().is_dilatable())
     diviser_par_rho_si_dilatable(fluent_,equation().milieu());
 
   const double alpha_min_dt = 1e-3; // avoid stupid time steps during vanishing phase
-  double dt_stab = 1.e30;
+  double dt_stab;
   int domaine_VDF_nb_elem=domaine_VDF.nb_elem();
+  bool is_pbm = sub_type(Pb_Multiphase, equation().probleme());
+  const DoubleTab* ptr_alpha = is_pbm ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : &fluent_ /* not used */;
   // dt_stab = min ( 1 / ( |U|/dx + |V|/dy + |W|/dz ) )
-  for (int num_poly=0; num_poly<domaine_VDF_nb_elem; num_poly++)
+  CDoubleTabView alpha = ptr_alpha->view_ro();
+  CDoubleArrView volumes = domaine_VDF.volumes().view_ro();
+  fluent_.view_ro();
+  Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), domaine_VDF_nb_elem, KOKKOS_LAMBDA(const int num_poly, double& dt_stab_)
+  {
     for (int n = 0; n < N; n++)
-      if ((!alp || (*alp)(num_poly, n) > alpha_min_dt))
+      if ((!is_pbm || alpha(num_poly, n) > alpha_min_dt))
         {
-          double dt_elem = volumes(num_poly)/(fluent_(num_poly, n)+DMINFLOAT);
-          if (dt_elem<dt_stab) dt_stab = dt_elem;
+          double dt_elem = volumes(num_poly)/(fluent(num_poly, n)+DMINFLOAT);
+          if (dt_elem<dt_stab_) dt_stab_ = dt_elem;
         }
+  }, Kokkos::Min<double>(dt_stab));
+  end_gpu_timer(__KERNEL_NAME__);
+  dt_stab = std::min(1.e30, dt_stab); // Kokkos::Min initializes the result to std::numeric_limits<double>::max(), not the 1e30 we want; without this clamp a division by 0 occurs later...
 
   dt_stab = Process::mp_min(dt_stab);
 
@@ -284,6 +319,7 @@ void Op_Conv_VDF_base::calculer_dt_local(DoubleTab& dt_face) const
 
   // Boucle sur les faces internes pour remplir fluent
   const int domaine_VDF_nb_faces = domaine_VDF.nb_faces(), premiere_face = domaine_VDF.premiere_face_int();
+  ToDo_Kokkos("critical");
   for (int face = premiere_face; face < domaine_VDF_nb_faces; face++)
     {
       const double value = vit[face]*face_surfaces(face);
@@ -303,6 +339,7 @@ void Op_Conv_VDF_base::calculer_dt_local(DoubleTab& dt_face) const
       const Cond_lim& la_cl = domaine_Cl_VDF.les_conditions_limites(n_bord);
       const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
       const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
+      ToDo_Kokkos("critical");
       for (int num_face = ndeb; num_face < nfin; num_face++)
         {
           if( sup_strict(fluent[num_face], 1.e-16) ) dt_face(num_face)= volumes_entrelaces(num_face)/fluent[num_face];
@@ -311,6 +348,7 @@ void Op_Conv_VDF_base::calculer_dt_local(DoubleTab& dt_face) const
     }
 
   // Boucle sur les faces internes
+  ToDo_Kokkos("critical");
   for (int num_face = premiere_face; num_face<domaine_VDF_nb_faces; num_face++)
     {
       if( sup_strict(fluent[num_face], 1.e-16) ) dt_face(num_face)= volumes_entrelaces(num_face)/fluent[num_face];
@@ -332,6 +370,7 @@ void Op_Conv_VDF_base::calculer_dt_local(DoubleTab& dt_face) const
           const Periodique& la_cl_perio = ref_cast(Periodique,la_cl.valeur());
           const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
           const int nb_faces_bord = le_bord.nb_faces();
+          ToDo_Kokkos("critical");
           for (int ind_face = 0; ind_face < nb_faces_bord; ind_face++)
             {
               int ind_face_associee = la_cl_perio.face_associee(ind_face);
@@ -379,6 +418,7 @@ void Op_Conv_VDF_base::calculer_pour_post(Champ_base& espace_stockage,const Nom&
               const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
               num1 = le_bord.num_premiere_face();
               num2 = num1 + le_bord.nb_faces();
+              ToDo_Kokkos("critical");
               for (face = num1; face < num2; face++)
                 {
                   psc = vit[face]*face_surfaces(face);
@@ -394,6 +434,7 @@ void Op_Conv_VDF_base::calculer_pour_post(Champ_base& espace_stockage,const Nom&
 
       // Boucle sur les faces internes pour remplir fluent
       const int domaine_VDF_nb_faces = domaine_VDF.nb_faces();
+      ToDo_Kokkos("critical");
       for (face = domaine_VDF.premiere_face_int(); face < domaine_VDF_nb_faces; face++)
         {
           psc = vit[face]*face_surfaces(face);
@@ -490,6 +531,7 @@ void Op_Conv_VDF_base::mettre_a_jour(double temps)
             {
               Champ_Face_VDF& c_ph = ref_cast(Champ_Face_VDF, cc_phases_[n].valeur());
               DoubleTab& v_ph = c_ph.valeurs();
+              ToDo_Kokkos("critical");
               for (f = 0; f < domaine.nb_faces(); v_ph(f) *= vit(f, m) * pf(f), f++)
                 for (v_ph(f) = 0, i = 0; i < 2; i++) v_ph(f) += (1. + (vit(f, m) * (i ? -1 : 1) >= 0 ? 1. : -1.) * 1.0 /* FIXME : amont */) / 2 * ((e = f_e(f, i)) >= 0 ? vcc(e, n) : bcc(f, n));
               c_ph.changer_temps(temps);
@@ -503,6 +545,7 @@ void Op_Conv_VDF_base::mettre_a_jour(double temps)
               Champ_Face_VDF& c_ph = ref_cast(Champ_Face_VDF, vd_phases_[n].valeur());
               DoubleTab& v_ph = c_ph.valeurs();
               /* on remplit la partie aux faces, puis on demande au champ d'interpoler aux elements */
+              ToDo_Kokkos("critical");
               for (f = 0; f < domaine.nb_faces(); v_ph(f) *= vit(f, m) * pf(f), f++)
                 for (v_ph(f) = 0, i = 0; i < 2; i++) v_ph(f) += (1. + (vit(f, m) * (i ? -1 : 1) >= 0 ? 1. : -1.) * 1.0 /* FIXME : amont */) / 2 * ((e = f_e(f, i)) >= 0 ? alp(e, n) : balp(f, n));
               c_ph.changer_temps(temps);
@@ -510,6 +553,7 @@ void Op_Conv_VDF_base::mettre_a_jour(double temps)
 
       DoubleTrav G(N), v(N, D);
       double Gt;
+      ToDo_Kokkos("critical");
       if (x_phases_.size())
         for (e = 0; e < domaine.nb_elem(); e++) //titre : aux elements
           {
diff --git a/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.h b/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.h
index 8cd850dcff..0bed89b59a 100644
--- a/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.h
+++ b/src/VDF/Operateurs/Op_Conv/Op_Conv_VDF_base.h
@@ -89,5 +89,6 @@ class Op_Conv_VDF_base : public Operateur_Conv_base
 
 // Fonction utile pour le calcul du pas de temps de stabilite
 inline void eval_fluent(const double , const int , const int , const int , DoubleTab& );
+KOKKOS_INLINE_FUNCTION void eval_fluent(const double , const int , const int , const int , DoubleTabView );
 
 #endif /* Op_Conv_VDF_base_included */
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_Face_Axi_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_Face_Axi_base.cpp
index de08db801e..5862899aeb 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_Face_Axi_base.cpp
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_Face_Axi_base.cpp
@@ -48,6 +48,7 @@ double Op_Diff_VDF_Face_Axi_base::calculer_dt_stab() const
 void Op_Diff_VDF_Face_Axi_base::ajouter_elem(const DoubleTab& inco, DoubleTab& resu) const
 {
   if (inco.line_size() > 1) not_implemented(__func__);
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++)
     {
       const int fx0 = elem_faces(num_elem,0), fx1 = elem_faces(num_elem,dimension), fy0 = elem_faces(num_elem,1), fy1 = elem_faces(num_elem,1+dimension);
@@ -74,6 +75,7 @@ void Op_Diff_VDF_Face_Axi_base::ajouter_elem(const DoubleTab& inco, DoubleTab& r
 
 void Op_Diff_VDF_Face_Axi_base::ajouter_elem_3D(const DoubleTab& inco, DoubleTab& resu) const
 {
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++)
     {
       const int fz0 = elem_faces(num_elem,2), fz1 = elem_faces(num_elem,2+dimension);
@@ -98,15 +100,14 @@ void  Op_Diff_VDF_Face_Axi_base::ajouter_aretes_bords(const DoubleTab& inco, Dou
         case TypeAreteBordVDF::PAROI_FLUIDE:
           {
             const int fac1 = Qdm(n_arete,0), fac2 = Qdm(n_arete,1), fac3 = Qdm(n_arete,2), signe  = Qdm(n_arete,3), ori1 = orientation(fac1), ori3 = orientation(fac3);
-            const int rang1 = fac1 - le_dom_vdf->premiere_face_bord(), rang2 = fac2 - le_dom_vdf->premiere_face_bord();
             double vit_imp, dist3, tps = inconnue->temps();
 
             if (n_type == TypeAreteBordVDF::PAROI_FLUIDE) // arete paroi_fluide :il faut determiner qui est la face fluide
               {
-                if (est_egal(inco[fac1],0)) vit_imp = Champ_Face_get_val_imp_face_bord(tps,rang2,ori3,la_zcl_vdf.valeur());
-                else vit_imp = Champ_Face_get_val_imp_face_bord(tps,rang1,ori3,la_zcl_vdf.valeur());
+                if (est_egal(inco[fac1],0)) vit_imp = Champ_Face_get_val_imp_face_bord(tps,fac2,ori3,la_zcl_vdf.valeur());
+                else vit_imp = Champ_Face_get_val_imp_face_bord(tps,fac1,ori3,la_zcl_vdf.valeur());
               }
-            else vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord(tps,rang1,ori3,la_zcl_vdf.valeur())+Champ_Face_get_val_imp_face_bord(tps,rang2,ori3,la_zcl_vdf.valeur()));
+            else vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord(tps,fac1,ori3,la_zcl_vdf.valeur())+Champ_Face_get_val_imp_face_bord(tps,fac2,ori3,la_zcl_vdf.valeur()));
 
             const double db_diffusivite = nu_mean_2_pts_(face_voisins(fac3,0),face_voisins(fac3,1));
 
@@ -313,6 +314,7 @@ void Op_Diff_VDF_Face_Axi_base::ajouter_contribution_elem(const DoubleTab& inco,
   const auto& tab1 = matrice.get_set_tab1();
   const auto& tab2 = matrice.get_set_tab2();
   auto& coeff = matrice.get_set_coeff();
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++)
     {
       const int fx0 = elem_faces(num_elem,0), fx1 = elem_faces(num_elem,dimension), fy0 = elem_faces(num_elem,1), fy1 = elem_faces(num_elem,1+dimension);
@@ -345,6 +347,7 @@ void Op_Diff_VDF_Face_Axi_base::ajouter_contribution_elem(const DoubleTab& inco,
 
 void Op_Diff_VDF_Face_Axi_base::ajouter_contribution_elem_3D(Matrice_Morse& matrice) const
 {
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++)
     {
       const int fz0 = elem_faces(num_elem,2), fz1 = elem_faces(num_elem,2+dimension);
@@ -572,17 +575,17 @@ void Op_Diff_VDF_Face_Axi_base::contribue_au_second_membre(DoubleTab& resu) cons
         case TypeAreteBordVDF::PAROI_FLUIDE:
           {
             const int fac1 = Qdm(n_arete,0), fac2 = Qdm(n_arete,1), fac3 = Qdm(n_arete,2), signe  = Qdm(n_arete,3);
-            const int ori1 = orientation(fac1), ori3 = orientation(fac3), rang1 = fac1 - le_dom_vdf->premiere_face_bord(), rang2 = fac2 - le_dom_vdf->premiere_face_bord();
+            const int ori1 = orientation(fac1), ori3 = orientation(fac3);
             double vit_imp, tps = inconnue->temps();
 
             if (n_type == TypeAreteBordVDF::PAROI_FLUIDE) // arete paroi_fluide :il faut determiner qui est la face fluide
               {
                 if (est_egal(inconnue->valeurs()(fac1), 0))
-                  vit_imp = Champ_Face_get_val_imp_face_bord(tps, rang2, ori3, la_zcl_vdf.valeur());
+                  vit_imp = Champ_Face_get_val_imp_face_bord(tps, fac2, ori3, la_zcl_vdf.valeur());
                 else
-                  vit_imp = Champ_Face_get_val_imp_face_bord(tps, rang1, ori3, la_zcl_vdf.valeur());
+                  vit_imp = Champ_Face_get_val_imp_face_bord(tps, fac1, ori3, la_zcl_vdf.valeur());
               }
-            else vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord(tps,rang1,ori3,la_zcl_vdf.valeur())+Champ_Face_get_val_imp_face_bord(tps,rang2,ori3,la_zcl_vdf.valeur()));
+            else vit_imp = 0.5*(Champ_Face_get_val_imp_face_bord(tps,fac1,ori3,la_zcl_vdf.valeur())+Champ_Face_get_val_imp_face_bord(tps,fac2,ori3,la_zcl_vdf.valeur()));
 
             const double db_diffusivite =  nu_mean_2_pts_(face_voisins(fac3,0),face_voisins(fac3,1));
 
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.cpp
index a01d47ed95..bfde8592b3 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.cpp
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.cpp
@@ -122,6 +122,7 @@ void Op_Diff_VDF_base::ajoute_terme_pour_axi(matrices_t matrices, DoubleTab& sec
               if (tab_diffusivite.size() == 1) diffu_tot = tab_diffusivite(0, 0);
               else diffu_tot = tab_diffusivite;
 
+              ToDo_Kokkos("critical");
               for (face = 0; face < nb_faces; face++)
                 for (int n = 0; n < N; n++)
                   if (ori(face) == 0)
@@ -173,39 +174,49 @@ double Op_Diff_VDF_base::calculer_dt_stab_(const Domaine_VDF& zone_VDF) const
   //      initial (comme en thermique) et non le Max sur les volumes de Qdm.
   double dt_stab = DMAXFLOAT;
   const Champ_base& ch_diffu = has_champ_masse_volumique() ? diffusivite() : diffusivite_pour_pas_de_temps();
-  const DoubleTab& diffu = ch_diffu.valeurs(), *alp = sub_type(Pb_Multiphase, equation().probleme()) ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr;
+  const DoubleTab& tab_diffu = ch_diffu.valeurs(), *tab_alp = sub_type(Pb_Multiphase, equation().probleme()) ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr;
   const bool Cdiffu = sub_type(Champ_Uniforme, ch_diffu);
+  const int nb_elem = zone_VDF.nb_elem();
+  const int nb_comp = tab_diffu.dimension(1);
+  const bool has_rho = has_champ_masse_volumique();
 
   // Si la diffusivite est variable, ce doit etre un champ aux elements.
-  assert(Cdiffu || diffu.size() == diffu.line_size() * zone_VDF.nb_elem());
+  assert(Cdiffu || tab_diffu.size() == tab_diffu.line_size() * nb_elem);
 
   int rho_comme_diff = 0;
-  if (has_champ_masse_volumique())
+  int dim = Objet_U::dimension;
+  Domaine_VDF_View dom_VDF(zone_VDF); // View on the Domaine_VDF so that dim_elem() can be called inside the Kokkos kernel below
+  CDoubleTabView rho;
+  if (has_rho)
     {
-      const DoubleTab& rho = get_champ_masse_volumique().valeurs();
-      rho_comme_diff = (rho.dimension(1) == diffu.dimension(1));
-    }
-
-  for (int elem = 0; elem < zone_VDF.nb_elem(); elem++)
-    {
-      double h = 0;
-      for (int d = 0 ; d < dimension; d++)
-        {
-          const double l = zone_VDF.dim_elem(elem, d);
-          h += 1. / (l * l);
-        }
-      for (int n = 0; n < diffu.dimension(1); n++)
-        {
-          double alpha_loc = diffu(Cdiffu ? 0 : elem, n);
-          if (has_champ_masse_volumique())
-            {
-              const DoubleTab& rho = get_champ_masse_volumique().valeurs();
-              alpha_loc/= rho(elem, rho_comme_diff * n);
-            }
-          const double dt_loc = (alp ? (*alp)(elem, n) : 1.0) * 0.5 / ((alpha_loc + DMINFLOAT) * h);
-          if (dt_loc < dt_stab) dt_stab = dt_loc;
-        }
+      rho = get_champ_masse_volumique().valeurs().view_ro();
+      rho_comme_diff = (get_champ_masse_volumique().valeurs().dimension(1) == tab_diffu.dimension(1));
     }
+  CDoubleTabView alp;
+  if (tab_alp) alp = tab_alp->view_ro();
+  CDoubleTabView diffu = tab_diffu.view_ro();
+  Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem, double& dtstab)
+  {
+    // Mesh coefficient h = sum over the spatial dimensions d of 1/(l_d*l_d), with l_d = dim_elem(elem, d)
+    double h = 0.0;
+    for (int d = 0; d < dim; d++)
+      {
+        const double l = dom_VDF.dim_elem(elem, d);
+        h += 1.0 / (l * l);
+      }
+    // Loop over components
+    for (int n = 0; n < nb_comp; n++)
+      {
+        double alpha_loc = diffu(Cdiffu ? 0 : elem, n);
+        if (has_rho)
+          {
+            alpha_loc /= rho(elem, rho_comme_diff * n);
+          }
+        const double dt_loc = (alp.data() ? alp(elem, n) : 1.0) * 0.5 / ((alpha_loc + DMINFLOAT) * h);
+        if (dt_loc < dtstab) dtstab = dt_loc;
+      }
+  }, Kokkos::Min<double>(dt_stab));
+  end_gpu_timer(__KERNEL_NAME__);
 
   return Process::mp_min(dt_stab);
 }
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.h b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.h
index dd7e7f96f7..d624bcdee1 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.h
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Diff_VDF_base.h
@@ -43,8 +43,10 @@ class Op_Diff_VDF_base : public Operateur_Diff_base
   void contribuer_au_second_membre(DoubleTab& resu) const override { iter_->contribuer_au_second_membre(resu); }
   void check_multiphase_compatibility() const override { }
 
-protected:
+  protected_but_public_for_cuda
   double calculer_dt_stab_(const Domaine_VDF& zone_VDF) const;
+
+protected:
   void ajoute_terme_pour_axi(matrices_t , DoubleTab& , const tabs_t& ) const;
 
   OWN_PTR(Iterateur_VDF_base) iter_;
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.cpp
index 79eb7f7a07..9e84b572f2 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.cpp
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -32,36 +32,46 @@ double Op_Dift_VDF_Elem_base::calculer_dt_stab_elem() const
 {
   double dt_stab, coef = -1.e10;
   const Domaine_VDF& domaine_VDF = iter_->domaine();
-  const IntTab& elem_faces = domaine_VDF.elem_faces();
+  const IntTab& tab_elem_faces = domaine_VDF.elem_faces();  // need elem_faces ro
   const DoubleVect& alpha_t = diffusivite_turbulente().valeurs();
   bool is_concentration = (equation().que_suis_je().debute_par("Convection_Diffusion_Concentration") || equation().que_suis_je().debute_par("Convection_Diffusion_Espece"));
 
-  ArrOfInt numfa(2*dimension);
-  for (int elem = 0; elem < domaine_VDF.nb_elem(); elem++)
-    {
+  ToDo_Kokkos("critical"); // still accessed without a view inside the kernel: self->alpha_(), domaine_VDF.dist_face(), alpha_t
+  CIntTabView elem_faces = tab_elem_faces.view_ro();
+  // NOTE(review): Cp and rho are now fetched for every equation (the old loop only touched them when !is_concentration) - confirm the medium always defines them
+  const int Ccp = sub_type(Champ_Uniforme, mon_equation->milieu().capacite_calorifique());
+  const int Cr = sub_type(Champ_Uniforme, mon_equation->milieu().masse_volumique());
+  const DoubleTab& tab_Cp = mon_equation->milieu().capacite_calorifique().valeurs();    // NEED tab_Cp ro
+  const DoubleTab& tab_r = mon_equation->milieu().masse_volumique().valeurs();          // NEED tab_r  ro
+  CDoubleTabView Cp = tab_Cp.view_ro();
+  CDoubleTabView r = tab_r.view_ro();
+  auto* self = this;
+  const int l_dim = dimension; // plain local so the lambda captures the value (a function-local static is initialised once and is not captured)
+  Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(0, domaine_VDF.nb_elem()), KOKKOS_LAMBDA (const int elem, double& local_max)
+  {
+      int numfa[6]; // per-iteration scratch (2 * l_dim <= 6 faces); one array shared by all iterations would be written concurrently
       // choix du facteur
       double rcp = 1.;
       if (!is_concentration)
         {
-          const int Ccp = sub_type(Champ_Uniforme, mon_equation->milieu().capacite_calorifique());
-          const int Cr = sub_type(Champ_Uniforme, mon_equation->milieu().masse_volumique());
-          const DoubleTab& tab_Cp = mon_equation->milieu().capacite_calorifique().valeurs(), tab_r = mon_equation->milieu().masse_volumique().valeurs();
-          rcp = tab_r(Cr ? 0 : elem, 0) * tab_Cp(Ccp ? 0 : elem, 0);
+          rcp = r(Cr ? 0 : elem, 0) * Cp(Ccp ? 0 : elem, 0);
         }
 
       double moy = 0.;
-      for (int i = 0; i < 2 * dimension; i++) numfa[i] = elem_faces(elem, i);
+      for (int i = 0; i < 2 * l_dim; i++) numfa[i] = elem_faces(elem, i); // small loop over 4 to 6 elements
 
       // XXX : E Saikali j'ai corrige pour multi inco parce que c'etait 1/dx et pas 1/dx^2 ... donc attention si ecart !
-      // c'etait comme ca : for (int d = 0; d < dimension; d++) moy += 1. / (domaine_VDF.dist_face(numfa[d], numfa[dimension + d], d));
-      for (int d = 0; d < dimension; d++)
+      // it used to be: for (int d = 0; d < l_dim; d++) moy += 1. / (domaine_VDF.dist_face(numfa[d], numfa[l_dim + d], d));
+      for (int d = 0; d < l_dim; d++) // Also small
         {
-          const double hd = domaine_VDF.dist_face(numfa[d], numfa[dimension + d], d);
+          const double hd = domaine_VDF.dist_face(numfa[d], numfa[l_dim + d], d);
           moy += 1. / (hd * hd);
         }
-      const double alpha_local = (alpha_(elem) + alpha_t(elem)) / rcp * moy;
-      coef = std::max(coef, alpha_local);
-    }
+      const double alpha_local = (self->alpha_(elem) + alpha_t(elem)) / rcp * moy;
+      // accumulate into the reduction's running value: comparing against the captured 'coef' (-1.e10) would drop previously visited elements
+      if (alpha_local > local_max) local_max = alpha_local;
+  }, Kokkos::Max<double>(coef));
+  end_gpu_timer(__KERNEL_NAME__);
 
   coef = Process::mp_max(coef);
   dt_stab = 1. / (2. * (coef + DMINFLOAT));
@@ -88,6 +98,7 @@ double Op_Dift_VDF_Elem_base::calculer_dt_stab_elem_axi() const
   if (dimension == 2)
     {
       int numfa[4];
+      ToDo_Kokkos("critical");
       for (int elem=0; elem<domaine_VDF.nb_elem(); elem++)
         {
           for (int i=0; i<4; i++) numfa[i] = elem_faces(elem,i);
@@ -100,6 +111,7 @@ double Op_Dift_VDF_Elem_base::calculer_dt_stab_elem_axi() const
   else if (dimension == 3)
     {
       int numfa[6];
+      ToDo_Kokkos("critical");
       for (int elem=0; elem<domaine_VDF.nb_elem(); elem++)
         {
           for (int i=0; i<6; i++) numfa[i] = elem_faces(elem,i);
@@ -126,6 +138,7 @@ double Op_Dift_VDF_Elem_base::calculer_dt_stab_elem_var_axi() const
   IntVect numfa(2 * D);
   DoubleVect h(D);
 
+  ToDo_Kokkos("critical");
   for (int e = 0; e < domaine_VDF.nb_elem(); e++)
     {
       for (int i = 0; i < 2 * D; i++)
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.h b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.h
index 6e93b495ac..1a29a65e78 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.h
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Elem_base.h
@@ -32,6 +32,7 @@ class Op_Dift_VDF_Elem_base : public Op_Dift_VDF_base, public Op_VDF_Elem
   void dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl) const override;
 
 protected:
+  KOKKOS_INLINE_FUNCTION
   virtual double alpha_(const int ) const = 0;
 };
 
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_Axi_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_Axi_base.cpp
index defb36de62..33a6eb75c9 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_Axi_base.cpp
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_Axi_base.cpp
@@ -72,6 +72,7 @@ void Op_Dift_VDF_Face_Axi_base::associer_modele_turbulence(const Modele_turbulen
 
 void Op_Dift_VDF_Face_Axi_base::ajouter_elem(const DoubleVect& visco_turb, const DoubleTab& tau_diag, DoubleTab& resu) const
 {
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++)
     {
       const int fx0 = elem_faces(num_elem,0), fx1 = elem_faces(num_elem,dimension), fy0 = elem_faces(num_elem,1), fy1 = elem_faces(num_elem,1+dimension);
@@ -91,6 +92,7 @@ void Op_Dift_VDF_Face_Axi_base::ajouter_elem(const DoubleVect& visco_turb, const
 
 void Op_Dift_VDF_Face_Axi_base::ajouter_elem_3D(const DoubleVect& visco_turb, const DoubleTab& tau_diag, DoubleTab& resu) const
 {
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++)
     {
       const int fz0 = elem_faces(num_elem,2), fz1 = elem_faces(num_elem,2+dimension);
@@ -112,18 +114,17 @@ void Op_Dift_VDF_Face_Axi_base::ajouter_aretes_bords(const DoubleVect& visco_tur
         case TypeAreteBordVDF::PAROI_PAROI: // paroi-paroi
           {
             const int fac1 = Qdm(n_arete,0), fac2 = Qdm(n_arete,1), fac3 = Qdm(n_arete,2), ori3 = orientation(fac3);
-            const int rang1 = (fac1 - le_dom_vdf->premiere_face_bord()), rang2 = (fac2-le_dom_vdf->premiere_face_bord());
             double coef;
             if (is_var()) // XXX : E Saikali : sais pas quoi faire sinon ecarts ...
               {
                 // Calcul du frottement identique a celui de TRIOVF : On calcule la moyenne des u_star et on l'eleve au carre. On calcule la moyenne des surfaces
-                const double tau_tan_1 = tau_tan(rang1,ori3), tau_tan_2 = tau_tan(rang2,ori3) ;
+                const double tau_tan_1 = tau_tan(fac1,ori3), tau_tan_2 = tau_tan(fac2,ori3) ;
                 double tau = 0.5*(tau_tan_1 + tau_tan_2 ), surf = 0.5*(surface(fac1)+surface(fac2));
                 coef = tau*tau*surf;
               }
             else // Autre solution pour le calcul du frottement : On calcule u_star*u_star*surf sur chaque partie de la facette de Qdm
               {
-                const double tau1 = tau_tan(rang1,ori3)*0.5*surface(fac1), tau2 = tau_tan(rang2,ori3)*0.5*surface(fac2);
+                const double tau1 = tau_tan(fac1,ori3)*0.5*surface(fac1), tau2 = tau_tan(fac2,ori3)*0.5*surface(fac2);
                 coef = tau1+tau2;
               }
 
@@ -345,6 +346,7 @@ void Op_Dift_VDF_Face_Axi_base::ajouter_contribution_elem(const DoubleVect& visc
   auto& tab1 = matrice.get_set_tab1();
   auto& tab2 = matrice.get_set_tab2();
   auto& coeff = matrice.get_set_coeff();
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++)
     {
       const int fx0 = elem_faces(num_elem,0), fx1 = elem_faces(num_elem,dimension), fy0 = elem_faces(num_elem,1), fy1 = elem_faces(num_elem,1+dimension);
@@ -380,6 +382,7 @@ void Op_Dift_VDF_Face_Axi_base::ajouter_contribution_elem(const DoubleVect& visc
 
 void Op_Dift_VDF_Face_Axi_base::ajouter_contribution_elem_3D(const DoubleVect& visco_turb, const DoubleTab& tau_diag, Matrice_Morse& matrice) const
 {
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < le_dom_vdf->nb_elem(); num_elem++)
     {
       const int fz0 = elem_faces(num_elem,2), fz1 = elem_faces(num_elem,2+dimension);
@@ -593,17 +596,16 @@ void Op_Dift_VDF_Face_Axi_base::contribue_au_second_membre(DoubleTab& resu ) con
         case TypeAreteBordVDF::PAROI_PAROI:
           {
             const int fac1 = Qdm(n_arete,0), fac2 = Qdm(n_arete,1), fac3 = Qdm(n_arete,2), ori3 = orientation(fac3);
-            const int rang1 = (fac1 - le_dom_vdf->premiere_face_bord()), rang2 = (fac2 - le_dom_vdf->premiere_face_bord());
             double coef;
             if (is_var())
               {
                 // Calcul du frottement identique a celui de TRIOVF : On calcule la moyenne des u_star et on l'eleve au carre. On calcule la moyenne des surfaces
-                const double tau = 0.5*(sqrt(tau_tan(rang1,ori3)) + sqrt(tau_tan(rang2,ori3))), surf = 0.5*(surface(fac1)+surface(fac2));
+                const double tau = 0.5*(sqrt(tau_tan(fac1,ori3)) + sqrt(tau_tan(fac2,ori3))), surf = 0.5*(surface(fac1)+surface(fac2));
                 coef = tau*tau*surf;
               }
             else // Autre solution pour le calcul du frottement : On calcule u_star*u_star*surf sur chaque partie de la facette de Qdm
               {
-                const double tau1 = tau_tan(rang1,ori3)*0.5*surface(fac1), tau2 = tau_tan(rang2,ori3)*0.5*surface(fac2);
+                const double tau1 = tau_tan(fac1,ori3)*0.5*surface(fac1), tau2 = tau_tan(fac2,ori3)*0.5*surface(fac2);
                 coef = tau1+tau2;
               }
 
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_base.cpp
index 5883abcf0f..ef62eba3e5 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_base.cpp
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_Face_base.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -36,67 +36,67 @@ double Op_Dift_VDF_Face_base::calculer_dt_stab() const
  */
 double Op_Dift_VDF_Face_base::calculer_dt_stab(const Domaine_VDF& domaine_VDF) const
 {
-  double dt_stab, coef = -1.e10;
-  const DoubleTab& diffu = diffusivite().valeurs(), &diffu_turb = diffusivite_turbulente().valeurs();
+  double coef = -1.e10;
+  const DoubleTab& tab_diffu = diffusivite().valeurs(), &tab_diffu_turb = diffusivite_turbulente().valeurs();
 
   // B.Mat. 9/3/2005: pour traiter monophasique/qc/front-tracking de facon generique. Mettre a jour le qc et l'ancien ft pour utiliser ce mecanisme
   const int nb_elem = domaine_VDF.nb_elem(), dim = Objet_U::dimension;
+  Domaine_VDF_View dom_VDF(domaine_VDF); // wrapper used for dim_elem() inside the kernels below
+  CDoubleTabView diffu = tab_diffu.view_ro();
+  CDoubleTabView diffu_turb = tab_diffu_turb.view_ro();
+  const int nb_comp_diffu = tab_diffu.line_size(), nb_comp_diffu_turb = tab_diffu_turb.line_size(); // component counts read once, outside the kernels
+
   if (has_champ_masse_volumique())
     {
-      const DoubleTab& valeurs_rho = get_champ_masse_volumique().valeurs();
-      for (int elem = 0; elem < nb_elem; elem++)
-        {
-          double diflo = 0.;
-          for (int i = 0; i < dim; i++)
-            {
-              const double h = domaine_VDF.dim_elem(elem, i);
-              diflo += 1. / (h * h);
-            }
-          double mu_physique = diffu(elem, 0),  mu_turbulent = diffu_turb(elem, 0);
-
-          for (int ncomp = 1; ncomp < diffu.line_size(); ncomp++) mu_physique = std::max(mu_physique, diffu(elem, ncomp));
-          for (int ncomp = 1; ncomp < diffu_turb.line_size(); ncomp++) mu_turbulent = std::max(mu_turbulent, diffu_turb(elem, ncomp));
-
-          const double inv_rho = 1./valeurs_rho(elem) ;
-          diflo *= (mu_physique + mu_turbulent) * inv_rho;
-          coef = std::max(coef, diflo);
-        }
+      const DoubleTab& tab_rho = get_champ_masse_volumique().valeurs();
+      CDoubleTabView rho = tab_rho.view_ro();
+      Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem, double& coef_)
+      {
+        double diflo = 0.; // accumulates sum over directions of 1/h^2
+        for (int i = 0; i < dim; i++)
+          {
+            const double h = dom_VDF.dim_elem(elem, i);
+            diflo += 1. / (h * h);
+          }
+        double mu_physique = diffu(elem, 0), mu_turbulent = diffu_turb(elem, 0);
+        for (int ncomp = 1; ncomp < nb_comp_diffu; ncomp++) mu_physique = Kokkos::fmax(mu_physique, diffu(elem, ncomp)); // largest component
+        for (int ncomp = 1; ncomp < nb_comp_diffu_turb; ncomp++) mu_turbulent = Kokkos::fmax(mu_turbulent, diffu_turb(elem, ncomp)); // largest component
+        diflo *= (mu_physique + mu_turbulent) / rho(elem, 0); // (mu + mu_t) / rho * sum(1/h^2)
+        if (diflo > coef_) coef_ = diflo; // keep the running maximum of the reduction
+      }, Kokkos::Max<double>(coef));
+      end_gpu_timer(__KERNEL_NAME__);
     }
   else
     {
       const Champ_base& champ_diffu = diffusivite_pour_pas_de_temps();
-      const DoubleTab& diffu_dt = champ_diffu.valeurs();
-      const int diffu_dt_variable = (diffu_dt.dimension(0) == 1) ? 0 : 1, diffu_variable = (diffu.dimension(0) == 1) ? 0 : 1;
-      for (int elem = 0; elem < nb_elem; elem++)
-        {
-          double diflo = 0.;
-          for (int i = 0; i < dim; i++)
-            {
-              const double h = domaine_VDF.dim_elem(elem, i);
-              diflo += 1. / (h * h);
-            }
-
-          int item = (diffu_variable ? elem : 0);
-          double mu_physique = diffu(item, 0), mu_turbulent = diffu_turb(elem, 0);
-
-          for (int ncomp = 1; ncomp < diffu.line_size(); ncomp++) mu_physique = std::max(mu_physique, diffu(item, ncomp));
-          for (int ncomp = 1; ncomp < diffu_turb.line_size(); ncomp++) mu_turbulent = std::max(mu_turbulent, diffu_turb(elem, ncomp));
-
-          item = (diffu_dt_variable ? elem : 0);
-          double diffu_dt_l = diffu_dt(item, 0);
-
-          for (int ncomp = 1; ncomp < diffu_dt.line_size(); ncomp++) diffu_dt_l = std::max(diffu_dt_l, diffu_dt(item, ncomp));
-
-          // si on a associe mu au lieu de nu , on a nu sans diffu_dt
-          // le pas de temps de stab est nu+nu_t, on calcule (mu+mu_t)*(nu/mu)=(mu+mu_t)/rho=nu+nu_t (avantage par rapport a la division par rho ca marche aussi pour alpha et lambda et en VEF
-          diflo *= (mu_physique + mu_turbulent)*(diffu_dt_l)/mu_physique ;
-          coef = std::max(coef, diflo);
-        }
+      const DoubleTab& tab_diffu_dt = champ_diffu.valeurs();
+      const int diffu_dt_variable = (tab_diffu_dt.dimension(0) == 1) ? 0 : 1, diffu_variable = (tab_diffu.dimension(0) == 1) ? 0 : 1; // 0 when the table has a single row (row 0 is then used for every element)
+      const int nb_comp_diffu_dt = tab_diffu_dt.line_size();
+      CDoubleTabView diffu_dt = tab_diffu_dt.view_ro();
+      Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int elem, double& coef_)
+      {
+        double diflo = 0.; // accumulates sum over directions of 1/h^2
+        for (int i = 0; i < dim; i++)
+          {
+            const double h = dom_VDF.dim_elem(elem, i);
+            diflo += 1. / (h * h);
+          }
+        const int item = (diffu_variable ? elem : 0);
+        double mu_physique = diffu(item, 0), mu_turbulent = diffu_turb(elem, 0);
+        for (int ncomp = 1; ncomp < nb_comp_diffu; ncomp++) mu_physique = Kokkos::fmax(mu_physique, diffu(item, ncomp));
+        for (int ncomp = 1; ncomp < nb_comp_diffu_turb; ncomp++) mu_turbulent = Kokkos::fmax(mu_turbulent, diffu_turb(elem, ncomp));
+        const int item_dt = (diffu_dt_variable ? elem : 0);
+        double diffu_dt_l = diffu_dt(item_dt, 0);
+        for (int ncomp = 1; ncomp < nb_comp_diffu_dt; ncomp++) diffu_dt_l = Kokkos::fmax(diffu_dt_l, diffu_dt(item_dt, ncomp));
+        // if mu was associated instead of nu, nu is taken from diffu_dt (the French original reads "sans diffu_dt", presumably a typo for "dans")
+        // the stability time step is driven by nu+nu_t: we compute (mu+mu_t)*(nu/mu)=(mu+mu_t)/rho=nu+nu_t (advantage over dividing by rho: it also works for alpha and lambda, and in VEF)
+        diflo *= (mu_physique + mu_turbulent) * diffu_dt_l / mu_physique;
+        if (diflo > coef_) coef_ = diflo; // keep the running maximum of the reduction
+      }, Kokkos::Max<double>(coef));
+      end_gpu_timer(__KERNEL_NAME__);
     }
   coef = Process::mp_max(coef);
-  dt_stab = 0.5 / (coef+DMINFLOAT);
-
-  return dt_stab;
+  return 0.5 / (coef + DMINFLOAT); // dt_stab = 1 / (2 * max coef); DMINFLOAT presumably keeps the denominator away from zero
 }
 
 void Op_Dift_VDF_Face_base::calculer_borne_locale(DoubleVect& borne_visco_turb,double dt,double dt_diff_sur_dt_conv) const
 
 void Op_Dift_VDF_Face_base::calculer_borne_locale(DoubleVect& borne_visco_turb,double dt,double dt_diff_sur_dt_conv) const
@@ -106,6 +106,7 @@ void Op_Dift_VDF_Face_base::calculer_borne_locale(DoubleVect& borne_visco_turb,d
   const DoubleVect& diffu = champ_diffu.valeurs();
   const int diffu_variable = (diffu.size() == 1) ? 0 : 1, nb_elem = domaine_VDF.nb_elem();
   const double diffu_constante = (diffu_variable ? 0. : diffu(0));
+  ToDo_Kokkos("critical");
   for (int elem=0; elem<nb_elem; elem++)
     {
       double h_inv = 0;
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_base.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_base.cpp
index d1b32debb5..15bd18c60b 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_base.cpp
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Diff_Dift_base/Op_Dift_VDF_base.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -92,6 +92,7 @@ void Op_Dift_VDF_base::ajoute_terme_pour_axi_turb(matrices_t matrices, DoubleTab
               Process::exit();
             }
 
+          ToDo_Kokkos("critical");
           for (face = 0; face < nb_faces; face++)
             for (int n = 0; n < N; n++)
               if (ori(face) == 0)
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Elem.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Elem.cpp
index a3778dda70..0627fd7731 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Elem.cpp
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Elem.cpp
@@ -89,6 +89,7 @@ double Op_Dift_Multiphase_VDF_Elem::calculer_dt_stab() const
   double mu_turbulent, mu_physique, nu_physique, alfa;
 
   ArrOfInt numfa(2 * dim);
+  ToDo_Kokkos("critical");
   for (int elem = 0; elem < domaine_VDF.nb_elem(); elem++)
     {
       double diflo = 0.;
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Elem.h b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Elem.h
index 4e6689b884..a86c80fa36 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Elem.h
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Elem.h
@@ -37,7 +37,14 @@ class Op_Dift_Multiphase_VDF_Elem : public Op_Dift_VDF_Elem_base, public Op_Diff
 
   bool is_turb() const override { return true; }
   double calculer_dt_stab() const override;
-  double alpha_(const int i) const override { throw; }
+  KOKKOS_INLINE_FUNCTION double alpha_(const int i) const override { // per-element alpha_(int) is not provided by this class: neither path returns a value
+    KOKKOS_IF_ON_HOST((
+      throw; // bare throw, exactly what the function did before this change
+    ))
+    KOKKOS_IF_ON_DEVICE((
+      Kokkos::abort("alpha_(int) on device not supported for multiphase");
+    ))
+  }
   const Correlation_base* correlation_viscosite_turbulente() const override { return &(corr_.valeur()); }
   inline const Correlation_base& correlation() const { return corr_.valeur(); }
   inline const DoubleTab& alpha_() const { return tab_alpha_impl<Eval_Dift_Multiphase_VDF_Elem>(); }
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Face.cpp b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Face.cpp
index 15d6a6dc56..eac0189304 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Face.cpp
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_Multiphase_VDF_Face.cpp
@@ -79,6 +79,7 @@ double Op_Dift_Multiphase_VDF_Face::calculer_dt_stab() const
 
   double mu_turbulent, mu_physique, nu_physique;
 
+  ToDo_Kokkos("critical");
   for (int elem = 0; elem < domaine_VDF.nb_elem(); elem++)
     {
       double diflo = 0.;
diff --git a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_VDF_Elem_leaves.h b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_VDF_Elem_leaves.h
index 27d0d25e15..0c3f9f2ee1 100644
--- a/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_VDF_Elem_leaves.h
+++ b/src/VDF/Operateurs/Op_Diff_Dift/Op_Dift_VDF_Elem_leaves.h
@@ -34,7 +34,7 @@ class Op_Dift_VDF_Elem : public Op_Dift_VDF_Elem_base, public Op_Diff_Dift_VDF<O
 public:
   Op_Dift_VDF_Elem();
   inline double calculer_dt_stab() const override { return calculer_dt_stab_elem(); }
-  inline double alpha_(const int i) const override { return alpha_impl<Eval_Dift_VDF_Elem>(i); }
+  KOKKOS_INLINE_FUNCTION double alpha_(const int i) const override { return alpha_impl<Eval_Dift_VDF_Elem>(i); }
   inline void associer_loipar(const Turbulence_paroi_scal_base& lp ) { associer_loipar_impl<Type_Operateur::Op_DIFT_ELEM,Eval_Dift_VDF_Elem>(lp); }
   inline void associer(const Domaine_dis_base& zd, const Domaine_Cl_dis_base& zcd, const Champ_Inc_base& ch) override { associer_impl<Type_Operateur::Op_DIFT_ELEM,Eval_Dift_VDF_Elem>(zd,zcd,ch); }
   inline void associer_diffusivite_turbulente(const Champ_Fonc_base& ch) { associer_diffusivite_turbulente_impl<Type_Operateur::Op_DIFT_ELEM,Eval_Dift_VDF_Elem>(ch); }
@@ -55,7 +55,7 @@ class Op_Dift_VDF_Elem_Axi : public Op_Dift_VDF_Elem_base, public Op_Diff_Dift_V
 public:
   Op_Dift_VDF_Elem_Axi();
   inline double calculer_dt_stab() const override { return calculer_dt_stab_elem_axi(); }
-  inline double alpha_(const int i) const override { return alpha_impl<Eval_Dift_VDF_Elem_Axi>(i); }
+  KOKKOS_INLINE_FUNCTION double alpha_(const int i) const override { return alpha_impl<Eval_Dift_VDF_Elem_Axi>(i); }
   inline void associer_loipar(const Turbulence_paroi_scal_base& lp ) { associer_loipar_impl<Type_Operateur::Op_DIFT_ELEM,Eval_Dift_VDF_Elem_Axi>(lp); }
   inline void associer(const Domaine_dis_base& zd, const Domaine_Cl_dis_base& zcd, const Champ_Inc_base& ch) override { associer_impl<Type_Operateur::Op_DIFT_ELEM,Eval_Dift_VDF_Elem_Axi>(zd,zcd,ch); }
   inline void associer_diffusivite_turbulente(const Champ_Fonc_base& ch) { associer_diffusivite_turbulente_impl<Type_Operateur::Op_DIFT_ELEM,Eval_Dift_VDF_Elem_Axi>(ch); }
@@ -76,12 +76,11 @@ class Op_Dift_VDF_Multi_inco_Elem : public Op_Dift_VDF_Elem_base, public Op_Diff
 public:
   Op_Dift_VDF_Multi_inco_Elem();
   inline double calculer_dt_stab() const override { return calculer_dt_stab_elem(); }
-  inline double alpha_(const int i) const override
+  KOKKOS_INLINE_FUNCTION double alpha_(const int i) const override
   {
     const DoubleTab& alpha = diffusivite_pour_pas_de_temps().valeurs();
-    const int is_var = sub_type(Champ_Uniforme, diffusivite()) ? 0 : 1;
-    double alpha_lam = alpha(is_var * i,0);
-    for (int k = 1; k < alpha.line_size(); k++) alpha_lam = std::max(alpha_lam, alpha(is_var * i,k));
+    double alpha_lam = alpha(is_var_ * i,0);
+    for (int k = 1; k < alpha.line_size(); k++) alpha_lam = std::max(alpha_lam, alpha(is_var_ * i,k));
     return alpha_lam;
   }
   inline void associer_loipar(const Turbulence_paroi_scal_base& lp ) { associer_loipar_impl<Type_Operateur::Op_DIFT_ELEM,Eval_Dift_VDF_Multi_inco_Elem>(lp); }
@@ -94,6 +93,9 @@ class Op_Dift_VDF_Multi_inco_Elem : public Op_Dift_VDF_Elem_base, public Op_Diff
     completer_impl<Type_Operateur::Op_DIFT_ELEM, Eval_Dift_VDF_Multi_inco_Elem>();
     associer_pb<Eval_Dift_VDF_Multi_inco_Elem>(equation().probleme());
   }
+
+private:
+  const int is_var_ = sub_type(Champ_Uniforme, diffusivite()) ? 0 : 1; // 0 for a Champ_Uniforme diffusivity, 1 otherwise. NOTE(review): a default member initializer runs at construction - confirm diffusivite() is already usable then (alpha_() used to evaluate this test on every call)
 };
 
 // ===========================================================================================================================================
diff --git a/src/VDF/Operateurs/Op_Divers/Op_Div_VDF_Elem.cpp b/src/VDF/Operateurs/Op_Divers/Op_Div_VDF_Elem.cpp
index 1f463c6b9d..bff2fc795a 100644
--- a/src/VDF/Operateurs/Op_Divers/Op_Div_VDF_Elem.cpp
+++ b/src/VDF/Operateurs/Op_Divers/Op_Div_VDF_Elem.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -14,6 +14,8 @@
 *****************************************************************************/
 
 #include <Op_Div_VDF_Elem.h>
+#include <kokkos++.h>
+#include <TRUSTArray_kokkos.tpp>
 
 Implemente_instanciable_sans_constructeur(Op_Div_VDF_Elem,"Op_Div_VDF_Face",Op_Div_VDF_base);
 
@@ -37,11 +39,16 @@ void Op_Div_VDF_Elem::associer(const Domaine_dis_base& domaine_dis, const Domain
   la_zcl_vdf = zclvdf;
 }
 
-void Op_Div_VDF_Elem::volumique(DoubleTab& div) const
+void Op_Div_VDF_Elem::volumique(DoubleTab& tab_div) const
 {
   const Domaine_VDF& domaine_VDF = le_dom_vdf.valeur();
-  const DoubleVect& vol = domaine_VDF.volumes();
   const int nb_elem = domaine_VDF.domaine().nb_elem_tot();
 
-  for(int num_elem = 0; num_elem < nb_elem; num_elem++) div(num_elem) /= vol(num_elem);
+  CDoubleArrView volumes = domaine_VDF.volumes().view_ro();
+  DoubleArrView div_view = static_cast<ArrOfDouble&>(tab_div).view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int e)
+  {
+    div_view(e) /= volumes(e); // in place: tab_div[e] <- tab_div[e] / volume[e]
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_P0_to_Face.cpp b/src/VDF/Operateurs/Op_Divers/Op_Grad_P0_to_Face.cpp
index ecc95c7b6f..423bce2eed 100644
--- a/src/VDF/Operateurs/Op_Divers/Op_Grad_P0_to_Face.cpp
+++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_P0_to_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -59,49 +59,61 @@ void Op_Grad_P0_to_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, c
         const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
 
         if (sub_type(Periodique, la_cl.valeur())) // Correction periodicite
-          for (int num_face = ndeb; num_face < nfin; num_face++)
-            {
-              const int n0 = face_voisins(num_face, 0), n1 = face_voisins(num_face, 1);
-              const double dist = volume_entrelaces(num_face) / face_surfaces(num_face);
-              secmem(num_face, k) -= (inco(n1, k) - inco(n0, k)) / dist;
-            }
+          {
+            ToDo_Kokkos("critical");
+            for (int num_face = ndeb; num_face < nfin; num_face++)
+              {
+                const int n0 = face_voisins(num_face, 0), n1 = face_voisins(num_face, 1);
+                const double dist = volume_entrelaces(num_face) / face_surfaces(num_face);
+                secmem(num_face, k) -= (inco(n1, k) - inco(n0, k)) / dist;
+              }
+          }
         else if (sub_type(Dirichlet, la_cl.valeur())) // Cas CL Dirichlet
           {
             const Dirichlet& cl = ref_cast(Dirichlet, la_cl.valeur());
             // XXX Elie Saikali : on calcule pas si champ_front_var n'est pas initialise
             if (cl.champ_front().has_valeurs_au_temps(cl.champ_front().get_temps_defaut()))
-              for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
-                {
-                  int n0 = face_voisins(num_face, 0);
-                  if (n0 < 0)
-                    n0 = face_voisins(num_face, 1);
-                  const int ori = orientation(num_face);
-                  secmem(num_face, k) -= (inco(n0, k) - cl.val_imp(num_face_cl, k)) / (xp(n0, ori) - xv(num_face, ori));
-                }
+              {
+                ToDo_Kokkos("critical");
+                for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
+                  {
+                    int n0 = face_voisins(num_face, 0);
+                    if (n0 < 0)
+                      n0 = face_voisins(num_face, 1);
+                    const int ori = orientation(num_face);
+                    secmem(num_face, k) -= (inco(n0, k) - cl.val_imp(num_face_cl, k)) / (xp(n0, ori) - xv(num_face, ori));
+                  }
+              }
           }
         else if (sub_type(Dirichlet_homogene, la_cl.valeur())) // Cas Dirichlet homogene, i.e. valeur nulle a la paroi
-          for (int num_face = ndeb; num_face < nfin; num_face++)
-            {
-              int n0 = face_voisins(num_face, 0);
-              if (n0 < 0)
-                n0 = face_voisins(num_face, 1);
-              const int ori = orientation(num_face);
-              secmem(num_face, k) -= inco(n0, k) / (xp(n0, ori) - xv(num_face, ori));
-            }
+          {
+            ToDo_Kokkos("critical");
+            for (int num_face = ndeb; num_face < nfin; num_face++)
+              {
+                int n0 = face_voisins(num_face, 0);
+                if (n0 < 0)
+                  n0 = face_voisins(num_face, 1);
+                const int ori = orientation(num_face);
+                secmem(num_face, k) -= inco(n0, k) / (xp(n0, ori) - xv(num_face, ori));
+              }
+          }
         else if (sub_type(Echange_impose_base, la_cl.valeur())) // Cas Echange_impose_base
           {
             const Echange_impose_base& cl = ref_cast(Echange_impose_base, la_cl.valeur());
             if (cl.has_h_imp_grad())
-              for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
-                {
-                  int n0 = face_voisins(num_face, 0);
-                  if (n0 < 0)
-                    n0 = face_voisins(num_face, 1);
-                  if (face_voisins(num_face, 0) >= 0)
-                    secmem(num_face, k) -= (inco(n0, k) - cl.T_ext(num_face_cl, k)) * cl.h_imp_grad(num_face_cl, k); // Si bien oriente
-                  else
-                    secmem(num_face, k) += (inco(n0, k) - cl.T_ext(num_face_cl, k)) * cl.h_imp_grad(num_face_cl, k); // Si oriente a envers
-                }
+              {
+                ToDo_Kokkos("critical");
+                for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
+                  {
+                    int n0 = face_voisins(num_face, 0);
+                    if (n0 < 0)
+                      n0 = face_voisins(num_face, 1);
+                    if (face_voisins(num_face, 0) >= 0)
+                      secmem(num_face, k) -= (inco(n0, k) - cl.T_ext(num_face_cl, k)) * cl.h_imp_grad(num_face_cl, k); // Si bien oriente
+                    else
+                      secmem(num_face, k) += (inco(n0, k) - cl.T_ext(num_face_cl, k)) * cl.h_imp_grad(num_face_cl, k); // Si oriente a envers
+                  }
+              }
             else { /* Do nothing */ }
           }
         else if (sub_type(Neumann_paroi, la_cl.valeur())) // Cas Neumann_paroi
@@ -109,36 +121,43 @@ void Op_Grad_P0_to_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, c
             const Neumann_paroi& cl = ref_cast(Neumann_paroi, la_cl.valeur());
             // XXX Elie Saikali : on calcule pas si champ_front_var n'est pas initialise
             if (cl.champ_front().has_valeurs_au_temps(cl.champ_front().get_temps_defaut()))
-              for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
-                {
-                  if (face_voisins(num_face, 0) >= 0)
-                    secmem(num_face, k) -= cl.flux_impose(num_face_cl, k); // Si bien oriente
-                  else
-                    secmem(num_face, k) += cl.flux_impose(num_face_cl, k); // Si oriente a envers
-                }
+              {
+                ToDo_Kokkos("critical");
+                for (int num_face = ndeb, num_face_cl = 0; num_face < nfin; num_face++, num_face_cl++)
+                  {
+                    if (face_voisins(num_face, 0) >= 0)
+                      secmem(num_face, k) -= cl.flux_impose(num_face_cl, k); // Si bien oriente
+                    else
+                      secmem(num_face, k) += cl.flux_impose(num_face_cl, k); // Si oriente a envers
+                  }
+              }
           }
         else if (!sub_type(Neumann_homogene, la_cl.valeur())) // En Neumann homogene, i.e. symetrie, la derivee a la face est nulle => on fait rien
-          for (int num_face = ndeb; num_face < nfin; num_face++)
-            {
-              int n0 = face_voisins(num_face, 0);
-              if (n0 < 0)
-                n0 = face_voisins(num_face, 1);
+          {
+            ToDo_Kokkos("critical");
+            for (int num_face = ndeb; num_face < nfin; num_face++)
+              {
+                int n0 = face_voisins(num_face, 0);
+                if (n0 < 0)
+                  n0 = face_voisins(num_face, 1);
 
-              const int ori = orientation(num_face);
-              int face_opposee = zvdf.elem_faces(n0, ori);
-              if (face_opposee == num_face)
-                face_opposee = zvdf.elem_faces(n0, ori + dimension);
+                const int ori = orientation(num_face);
+                int face_opposee = zvdf.elem_faces(n0, ori);
+                if (face_opposee == num_face)
+                  face_opposee = zvdf.elem_faces(n0, ori + dimension);
 
-              int n1 = face_voisins(face_opposee, 0);
-              if ((n1 < 0) || ((n1 == n0) && face_voisins(face_opposee, 1) >= 0))
-                n1 = face_voisins(face_opposee, 1);
+                int n1 = face_voisins(face_opposee, 0);
+                if ((n1 < 0) || ((n1 == n0) && face_voisins(face_opposee, 1) >= 0))
+                  n1 = face_voisins(face_opposee, 1);
 
-              if (n1 != n0)
-                secmem(num_face, k) -= (inco(n1, k) - inco(n0, k)) / (xp(n1, ori) - xp(n0, ori));
-            }
+                if (n1 != n0)
+                  secmem(num_face, k) -= (inco(n1, k) - inco(n0, k)) / (xp(n1, ori) - xp(n0, ori));
+              }
+          }
       }
 
   // Boucle sur les faces internes
+  ToDo_Kokkos("critical");
   for (int num_face = zvdf.premiere_face_int(); num_face < zvdf.nb_faces(); num_face++)
     for (int k = 0; k < N; k++)
       {
diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.cpp b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.cpp
index cb23356d01..bf23150b81 100644
--- a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.cpp
+++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.cpp
@@ -47,8 +47,6 @@ void Op_Grad_VDF_Face::calculer_flux_bords() const
   const Domaine_Cl_VDF& zclvdf = la_zcl_vdf.valeur();
   const Navier_Stokes_std& eqn_hydr = ref_cast(Navier_Stokes_std,equation());
   const Champ_P0_VDF& la_pression_P0 = ref_cast(Champ_P0_VDF,eqn_hydr.pression_pa());
-  const DoubleTab& pression_P0 = la_pression_P0.valeurs();
-  const DoubleVect& face_surfaces = zvdf.face_surfaces();
   int nb_bord = zvdf.nb_front_Cl();
   for (int n_bord=0; n_bord<nb_bord; n_bord++)
     {
@@ -56,18 +54,25 @@ void Op_Grad_VDF_Face::calculer_flux_bords() const
       const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
       int ndeb = le_bord.num_premiere_face();
       int nfin = ndeb + le_bord.nb_faces();
-      for (int face=ndeb; face<nfin; face++)
-        {
-          int elem0 = face_voisins(face,0);
-          int ori = orientation(face);
-          double n0 = face_surfaces(face)*porosite_surf(face);
-          if (elem0 != -1) flux_bords_(face,ori) = (pression_P0(elem0))*n0 ;
-          else
-            {
-              int elem1 = face_voisins(face,1);
-              flux_bords_(face,ori) = -(pression_P0(elem1))*n0 ;
-            }
-        } // fin for face
+      CIntTabView face_voisins_v = face_voisins.view_ro();
+      CIntArrView orientation = orientation_.view_ro();
+      CDoubleArrView face_surfaces = zvdf.face_surfaces().view_ro();
+      CDoubleArrView porosite_surf_v = porosite_surf.view_ro();
+      CDoubleArrView pression_P0 = static_cast<const ArrOfDouble&>(la_pression_P0.valeurs()).view_ro();
+      DoubleTabView flux_bords = flux_bords_.view_rw(); // rw rather than wo: only the (face, ori) entries of this boundary are written, the other entries must be preserved
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int face)
+      {
+        int elem0 = face_voisins_v(face,0);
+        int ori = orientation(face);
+        double n0 = face_surfaces(face)*porosite_surf_v(face); // face area weighted by the surface porosity
+        if (elem0 != -1) flux_bords(face,ori) = (pression_P0(elem0))*n0 ; // element on side 0: positive sign
+        else
+          {
+            int elem1 = face_voisins_v(face,1); // no element on side 0: use side 1 with the opposite sign
+            flux_bords(face,ori) = -(pression_P0(elem1))*n0 ;
+          }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     } // fin for n_bord
 }
 
@@ -79,9 +84,8 @@ int Op_Grad_VDF_Face::impr(Sortie& os) const
   const Schema_Temps_base& sch = equation().probleme().schema_temps();
   const Domaine_VDF& zvdf = le_dom_vdf.valeur();
   const Domaine_Cl_VDF& zclvdf = la_zcl_vdf.valeur();
-  int face, ori;
-  DoubleTab xgr;
-  if (impr_mom) xgr = zvdf.calculer_xgr();
+  DoubleTrav tab_xgr;
+  if (impr_mom) tab_xgr = zvdf.calculer_xgr();
   // flux_bords contains the sum of flux on each boundary:
   DoubleTrav tab_flux_bords(3,zvdf.nb_front_Cl(),3);
   tab_flux_bords=0.;
@@ -89,7 +93,7 @@ int Op_Grad_VDF_Face::impr(Sortie& os) const
       flux_bord_perio1(k)   ->   flux_bords2(1,num_cl,k)
       flux_bord_perio2(k)   ->   flux_bords2(2,num_cl,k)
       moment(k)             ->   flux_bords2(3,num_cl,k) */
-  int nb_bord =  zvdf.nb_front_Cl();
+  int nb_bord = zvdf.nb_front_Cl();
   for (int n_bord=0; n_bord<nb_bord; n_bord++)
     {
       const Cond_lim& la_cl = zclvdf.les_conditions_limites(n_bord);
@@ -97,38 +101,50 @@ int Op_Grad_VDF_Face::impr(Sortie& os) const
       int impr_boundary = (zvdf.domaine().bords_a_imprimer_sum().contient(le_bord.le_nom()) ? 1 : 0);
       int ndeb = le_bord.num_premiere_face();
       int nfin = ndeb + le_bord.nb_faces();
+      int dim = Objet_U::dimension;
+      CIntArrView orientation = zvdf.orientation().view_ro();
+      CDoubleTabView flux_bords = flux_bords_.view_ro();
+      DoubleTabView3 sum_flux_bords = tab_flux_bords.view_rw<3>();
+      CDoubleTabView xgr;
+      if (impr_mom) xgr = tab_xgr.view_ro();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int face)
+      {
+        int ori = orientation(face);
+        Kokkos::atomic_add(&sum_flux_bords(0, n_bord, ori), + flux_bords(face, ori));
 
-      for (face=ndeb; face<nfin; face++)
-        {
-          ori = orientation(face);
-          tab_flux_bords(0, n_bord, ori) += flux_bords_(face,ori) ;
-
-          if (dimension == 2)
-            {
-              if (impr_mom)
-                tab_flux_bords(2, n_bord, 2) +=flux_bords_(face,1)*xgr(face,0)-flux_bords_(face,0)*xgr(face,1);
-              if (impr_boundary)
-                {
-                  tab_flux_bords(1, n_bord, 0) += flux_bords_(face,0) ;
-                  tab_flux_bords(1, n_bord, 1) += flux_bords_(face,1) ;
-                }
-            }
-          else if (dimension == 3)
-            {
-              if (impr_mom)
-                {
-                  tab_flux_bords(2, n_bord, 0) +=flux_bords_(face,2)*xgr(face,1)-flux_bords_(face,1)*xgr(face,2);
-                  tab_flux_bords(2, n_bord, 1) +=flux_bords_(face,0)*xgr(face,2)-flux_bords_(face,2)*xgr(face,0);
-                  tab_flux_bords(2, n_bord, 2) +=flux_bords_(face,1)*xgr(face,0)-flux_bords_(face,0)*xgr(face,1);
-                }
-              if (impr_boundary)
-                {
-                  tab_flux_bords(1, n_bord, 0) += flux_bords_(face,0) ;
-                  tab_flux_bords(1, n_bord, 1) += flux_bords_(face,1) ;
-                  tab_flux_bords(1, n_bord, 2) += flux_bords_(face,2) ;
-                }
-            }
-        } // fin for face
+        if (dim == 2)
+          {
+            if (impr_mom)
+              {
+                double moment_contrib = flux_bords(face,1)*xgr(face,0) - flux_bords(face,0)*xgr(face,1);
+                Kokkos::atomic_add(&sum_flux_bords(2, n_bord, 2), + moment_contrib);
+              }
+            if (impr_boundary)
+              {
+                Kokkos::atomic_add(&sum_flux_bords(1, n_bord, 0), + flux_bords(face,0));
+                Kokkos::atomic_add(&sum_flux_bords(1, n_bord, 1), + flux_bords(face,1));
+              }
+          }
+        else if (dim == 3)
+          {
+            if (impr_mom)
+              {
+                double moment_x = flux_bords(face,2)*xgr(face,1) - flux_bords(face,1)*xgr(face,2);
+                double moment_y = flux_bords(face,0)*xgr(face,2) - flux_bords(face,2)*xgr(face,0);
+                double moment_z = flux_bords(face,1)*xgr(face,0) - flux_bords(face,0)*xgr(face,1);
+                Kokkos::atomic_add(&sum_flux_bords(2, n_bord, 0), + moment_x);
+                Kokkos::atomic_add(&sum_flux_bords(2, n_bord, 1), + moment_y);
+                Kokkos::atomic_add(&sum_flux_bords(2, n_bord, 2), + moment_z);
+              }
+            if (impr_boundary)
+              {
+                Kokkos::atomic_add(&sum_flux_bords(1, n_bord, 0), + flux_bords(face,0));
+                Kokkos::atomic_add(&sum_flux_bords(1, n_bord, 1), + flux_bords(face,1));
+                Kokkos::atomic_add(&sum_flux_bords(1, n_bord, 2), + flux_bords(face,2));
+              }
+          }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     } // fin for n_bord
 
   // Sum on all process:
@@ -200,7 +216,7 @@ int Op_Grad_VDF_Face::impr(Sortie& os) const
                   sch.imprimer_temps_courant(Flux_grad_face);
                   Flux_grad_face << " : " << finl;
                 }
-              for (face=ndeb; face<nfin; face++)
+              for (int face=ndeb; face<nfin; face++)
                 {
                   Flux_grad_face << "# Face a x= " << zvdf.xv(face,0) << " y= " << zvdf.xv(face,1);
                   if (dimension==3) Flux_grad_face << " z= " << zvdf.xv(face,2);
@@ -227,6 +243,7 @@ void Op_Grad_VDF_Face::dimensionner_blocs(matrices_t matrices, const tabs_t& sem
   const int N = vit.line_size(), M = press.line_size();
   Matrice_Morse *mat = matrices["pression"], mat2;
 
+  ToDo_Kokkos("critical");
   for (int f = 0; f < zvdf.nb_faces(); f++)
     for (int i = 0, e; i < 2; i++)
       if ((e = zvdf.face_voisins(f, i)) >= 0)
@@ -239,20 +256,30 @@ void Op_Grad_VDF_Face::dimensionner_blocs(matrices_t matrices, const tabs_t& sem
 }
 
 
-void Op_Grad_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, const tabs_t& semi_impl) const
+void Op_Grad_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& tab_secmem, const tabs_t& semi_impl) const
 {
-  Matrice_Morse *mat = matrices.count("pression") ? matrices.at("pression") : nullptr;
-  const DoubleTab& inco = semi_impl.count("pression") ? semi_impl.at("pression") : (le_champ_inco ? le_champ_inco->valeurs() : ref_cast(Navier_Stokes_std, equation()).pression().valeurs()),
-                   *alp = sub_type(Pb_Multiphase, equation().probleme()) ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr;
+  const DoubleTab& tab_inco = semi_impl.count("pression") ? semi_impl.at("pression") : (le_champ_inco ? le_champ_inco->valeurs() : ref_cast(Navier_Stokes_std, equation()).pression().valeurs());
+  const bool is_pbm = sub_type(Pb_Multiphase, equation().probleme());
+  const DoubleTab *alp = is_pbm ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr;
+  Matrice_Morse *ptr_mat = matrices.count("pression") ? matrices.at("pression") : nullptr;
 
-  assert_espace_virtuel_vect(inco);
+  assert_espace_virtuel_vect(tab_inco);
 
   const Domaine_VDF& zvdf = le_dom_vdf.valeur();
   const Domaine_Cl_VDF& zclvdf = la_zcl_vdf.valeur();
-  const DoubleVect& face_surfaces = zvdf.face_surfaces(), &vf = zvdf.volumes_entrelaces();
-  const DoubleTab& vfd = zvdf.volumes_entrelaces_dir();
-  const int M = inco.line_size(), N = secmem.line_size();
+  const int M = tab_inco.line_size(), N = tab_secmem.line_size();
 
+  Matrice_Morse_View mat;
+  if (ptr_mat) mat.set(*ptr_mat);
+  CDoubleTabView alpha;
+  if (is_pbm) alpha = alp->view_ro();
+  CIntTabView face_voisins_v = face_voisins.view_ro();
+  CDoubleArrView porosite_surf_v = porosite_surf.view_ro();
+  CDoubleArrView face_surfaces = zvdf.face_surfaces().view_ro();
+  CDoubleArrView vf = zvdf.volumes_entrelaces().view_ro();
+  CDoubleTabView vfd = zvdf.volumes_entrelaces_dir().view_ro();
+  CDoubleTabView inco = tab_inco.view_ro();
+  DoubleTabView secmem = tab_secmem.view_rw();
   // Boucle sur les bords pour traiter les conditions aux limites
   for (int n_bord = 0; n_bord < zvdf.nb_front_Cl(); n_bord++)
     {
@@ -263,57 +290,66 @@ void Op_Grad_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, con
       if ( sub_type(Neumann_sortie_libre,la_cl.valeur()) )
         {
           const Neumann_sortie_libre& la_cl_typee = ref_cast(Neumann_sortie_libre, la_cl.valeur());
-          for (int num_face = ndeb; num_face < nfin; num_face++)
+          const double coeff_P = Option_VDF::coeff_P_neumann;
+          CDoubleTabView flux_impose = la_cl_typee.tab_flux_impose().view_ro();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+          {
             for (int n = 0, m = 0; n < N; n++, m += (M > 1))
               {
-                const double P_imp = la_cl_typee.flux_impose(num_face-ndeb, m);
-
-                const int n0 = face_voisins(num_face,0);
+                const double P_imp = flux_impose(num_face - ndeb, m);
+                const int n0 = face_voisins_v(num_face, 0);
                 if (n0 != -1)
                   {
-                    const double coef = face_surfaces(num_face)*porosite_surf(num_face) * Option_VDF::coeff_P_neumann * (alp ? (*alp)(n0, n) : 1);
-                    if(mat) (*mat)(N * num_face + n, M * n0 + m) -= coef;
+                    const double coef = face_surfaces(num_face) * porosite_surf_v(num_face) * coeff_P * (is_pbm ? alpha(n0, n) : 1.0);
+                    if (ptr_mat) mat.atomic_add(N * num_face + n, M * n0 + m, -coef);
                     secmem(num_face, n) -= coef * (P_imp - inco(n0, m));
                   }
                 else
                   {
-                    const int n1 = face_voisins(num_face,1);
-                    const double coef = face_surfaces(num_face)*porosite_surf(num_face) * Option_VDF::coeff_P_neumann * (alp ? (*alp)(n1, n) : 1.0);
-                    if(mat) (*mat)(N * num_face + n, M * n1 + m) += coef;
+                    const int n1 = face_voisins_v(num_face, 1);
+                    const double coef = face_surfaces(num_face) * porosite_surf_v(num_face) * coeff_P * (is_pbm ? alpha(n1, n) : 1.0);
+                    if (ptr_mat) mat.atomic_add(N * num_face + n, M * n1 + m, coef);
                     secmem(num_face, n) -= coef * (inco(n1, m) - P_imp);
                   }
               }
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
       else if (sub_type(Periodique,la_cl.valeur())) // Correction periodicite
         {
-          for (int f = ndeb; f < nfin; f++)
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int f)
+          {
             for (int n = 0, m = 0; n < N; n++, m += (M > 1))
               {
-                const int n0 = face_voisins(f, 0), n1 = face_voisins(f, 1);
-                const double alpha_face = alp ? (vfd(f, 0) * (*alp)(n0, n) + vfd(f, 1) * (*alp)(n1, n)) / vf(f) : 1.0;
-                const double coef = face_surfaces(f) * porosite_surf(f) * alpha_face;
+                const int n0 = face_voisins_v(f, 0), n1 = face_voisins_v(f, 1);
+                const double alpha_face = is_pbm ? (vfd(f, 0) * alpha(n0, n) + vfd(f, 1) * alpha(n1, n)) / vf(f) : 1.0;
+                const double coef = face_surfaces(f) * porosite_surf_v(f) * alpha_face;
                 secmem(f, n) -= coef * (inco(n1, m) - inco(n0, m));
               }
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
       else if (sub_type(Symetrie,la_cl.valeur())) { /* Do nothing */ }
       else if ( (sub_type(Dirichlet,la_cl.valeur())) || (sub_type(Dirichlet_homogene,la_cl.valeur())) ) { /* Do nothing */ }
     }
 
   // Boucle sur les faces internes
-  for (int f = zvdf.premiere_face_int(); f < zvdf.nb_faces(); f++)
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(zvdf.premiere_face_int(), zvdf.nb_faces()), KOKKOS_LAMBDA(const int f)
+  {
     for (int n = 0, m = 0; n < N; n++, m += (M > 1))
       {
-        const int n0 = face_voisins(f, 0), n1 = face_voisins(f, 1);
+        const int n0 = face_voisins_v(f, 0), n1 = face_voisins_v(f, 1);
         // XXX : Elie Saikali : attention : on code alpha grad(P) et pas grad(alpha.P) !! Sinon on manque des termes ... (voir avec Antoine sinon)
-        const double alpha_face = alp ? (vfd(f, 0) * (*alp)(n0, n) + vfd(f, 1) * (*alp)(n1, n)) / vf(f) : 1.0;
-        const double coef = face_surfaces(f) * porosite_surf(f) * alpha_face;
-        if(mat)
+        const double alpha_face = is_pbm ? (vfd(f, 0) * alpha(n0, n) + vfd(f, 1) * alpha(n1, n)) / vf(f) : 1.0;
+        const double coef = face_surfaces(f) * porosite_surf_v(f) * alpha_face;
+        if (ptr_mat)
           {
-            (*mat)(N * f + n, M * n0 + m) -= coef;
-            (*mat)(N * f + n, M * n1 + m) += coef;
+            mat.atomic_add(N * f + n, M * n0 + m, -coef);
+            mat.atomic_add(N * f + n, M * n1 + m, +coef);
           }
         secmem(f, n) -= coef * (inco(n1, m) - inco(n0, m));
       }
-
-  secmem.echange_espace_virtuel();
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  tab_secmem.echange_espace_virtuel();
 }
diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.h b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.h
index 15a7a8fede..2130345ae0 100644
--- a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.h
+++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.cpp b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.cpp
index c980579fbc..b81417f8ab 100644
--- a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.cpp
+++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -31,7 +31,7 @@ void Op_Grad_VDF_Face_base::associer(const Domaine_dis_base& domaine_dis, const
   porosite_surf.ref(la_zcl_vdf->equation().milieu().porosite_face());
   volume_entrelaces.ref(zvdf.volumes_entrelaces());
   face_voisins.ref(zvdf.face_voisins());
-  orientation.ref(zvdf.orientation());
+  orientation_.ref(zvdf.orientation());
   xp.ref(zvdf.xp());
 }
 
diff --git a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.h b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.h
index 844b63bcf9..2ad4d84255 100644
--- a/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.h
+++ b/src/VDF/Operateurs/Op_Divers/Op_Grad_VDF_Face_base.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -42,15 +42,15 @@ class Op_Grad_VDF_Face_base : public Operateur_Grad_base
   inline const double& volume_entrelaces_(int i) const { return volume_entrelaces(i); }
   inline double porosite_surf_(int i)  { return porosite_surf(i); }
   inline const double& porosite_surf_(int i) const { return porosite_surf(i); }
-  inline int orientation_(int face) { return orientation(face); }
-  inline const int& orientation_(int face) const { return orientation(face); }
+  inline int orientation(int face) { return orientation_(face); }
+  inline const int& orientation(int face) const { return orientation_(face); }
   inline double xp_(int elem, int ori) { return xp(elem,ori); }
   inline const double& xp_(int elem, int ori) const { return xp(elem,ori); }
 
 protected:
   OBS_PTR(Domaine_VDF) le_dom_vdf;
   OBS_PTR(Domaine_Cl_VDF) la_zcl_vdf;
-  IntVect orientation;
+  IntVect orientation_;
   IntTab face_voisins;
   DoubleVect porosite_surf, volume_entrelaces;
   DoubleTab xp;
diff --git a/src/VDF/Operateurs/Op_Divers/Op_VDF_Elem.cpp b/src/VDF/Operateurs/Op_Divers/Op_VDF_Elem.cpp
index 156bea107c..65fc10f069 100644
--- a/src/VDF/Operateurs/Op_Divers/Op_VDF_Elem.cpp
+++ b/src/VDF/Operateurs/Op_Divers/Op_VDF_Elem.cpp
@@ -54,6 +54,7 @@ void Op_VDF_Elem::dimensionner_old(const Domaine_VDF& le_dom, const Domaine_Cl_V
   IntVect rang_voisin(n1*nb_comp);
   rang_voisin = 1;
 
+  ToDo_Kokkos("critical");
   for (int num_face = ndeb; num_face < nfin; num_face++)
     {
       const int elem1 = face_voisins(num_face,0), elem2 = face_voisins(num_face,1);
@@ -73,6 +74,7 @@ void Op_VDF_Elem::dimensionner_old(const Domaine_VDF& le_dom, const Domaine_Cl_V
           int ind_face_global;
           IntVect fait(nfaces);
           fait = 0;
+          ToDo_Kokkos("critical");
           for (int face = 0; face < nfaces; face++)
             {
               if (fait[face] == 0)
@@ -104,6 +106,7 @@ void Op_VDF_Elem::dimensionner_old(const Domaine_VDF& le_dom, const Domaine_Cl_V
     }
 
   // on traite les faces internes pour les voisins
+  ToDo_Kokkos("critical");
   for (int num_face = ndeb; num_face < nfin; num_face++)
     {
       const int elem1 = face_voisins(num_face,0), elem2 = face_voisins(num_face,1);
diff --git a/src/VDF/Operateurs/Op_Divers/Op_VDF_Face.cpp b/src/VDF/Operateurs/Op_Divers/Op_VDF_Face.cpp
index 75732b4ea3..984affc339 100644
--- a/src/VDF/Operateurs/Op_Divers/Op_VDF_Face.cpp
+++ b/src/VDF/Operateurs/Op_Divers/Op_VDF_Face.cpp
@@ -42,6 +42,7 @@ void Op_VDF_Face::dimensionner(const Domaine_VDF& le_dom, const Domaine_Cl_VDF&
   IntVect rang_voisin(nfin);
   rang_voisin = 1;
 
+  ToDo_Kokkos("critical");
   for (int num_face = ndeb; num_face < nfin; num_face++)
     {
       const int ori = orientation(num_face), face1 = le_dom.face_amont_princ(num_face,0), face2 = le_dom.face_amont_princ(num_face,1),
@@ -62,11 +63,13 @@ void Op_VDF_Face::dimensionner(const Domaine_VDF& le_dom, const Domaine_Cl_VDF&
 
   // on balaye les faces pour dimensionner tab1 et tab2
   tab1(0) = 1;
+  ToDo_Kokkos("critical");
   for (int num_face = ndeb; num_face < nfin; num_face++)
     for (int k = 0; k < nb_comp; k++) tab1(num_face*nb_comp+1+k) = rang_voisin(num_face) + tab1(num_face*nb_comp+k);
 
   la_matrice.dimensionner(nfin*nb_comp,tab1(nfin*nb_comp)-1);
 
+  ToDo_Kokkos("critical");
   for (int num_face = ndeb; num_face < nfin; num_face++)
     {
       const int ori = orientation(num_face), face1 = le_dom.face_amont_princ(num_face,0), face2 = le_dom.face_amont_princ(num_face,1),
@@ -102,6 +105,7 @@ void Op_VDF_Face::dimensionner(const Domaine_VDF& le_dom, const Domaine_Cl_VDF&
           const Front_VF& la_front_dis = ref_cast(Front_VF,la_cl->frontiere_dis());
           const int ndeb_p = la_front_dis.num_premiere_face(), nfaces = la_front_dis.nb_faces(), nfin_p = ndeb_p + nfaces;
 
+          ToDo_Kokkos("critical");
           for (int num_face = ndeb_p; num_face < nfin_p; num_face++)
             {
               const int ori = orientation(num_face);
@@ -165,6 +169,7 @@ void Op_VDF_Face::modifier_pour_Cl(const Domaine_VDF& le_dom, const Domaine_Cl_V
         {
           const Dirichlet& la_cl_Dirichlet = ref_cast(Dirichlet, la_cl.valeur());
 
+          ToDo_Kokkos("critical");
           for (int face = numdeb; face < (numdeb + nfaces); face++)
             for (int comp = 0; comp < nb_comp; comp++)
               {
@@ -177,6 +182,7 @@ void Op_VDF_Face::modifier_pour_Cl(const Domaine_VDF& le_dom, const Domaine_Cl_V
 
       if (sub_type(Symetrie, la_cl.valeur()))
         {
+          ToDo_Kokkos("critical");
           for (int face = numdeb; face < numdeb + nfaces; face++)
             for (int comp = 0; comp < nb_comp; comp++)
               {
@@ -189,6 +195,7 @@ void Op_VDF_Face::modifier_pour_Cl(const Domaine_VDF& le_dom, const Domaine_Cl_V
         {
           const Dirichlet_homogene& la_cl_Dirichlet_homogene = ref_cast(Dirichlet_homogene, la_cl.valeur());
 
+          ToDo_Kokkos("critical");
           for (int face = numdeb; face < numdeb + nfaces; face++)
             for (int comp = 0; comp < nb_comp; comp++)
               {
diff --git a/src/VDF/Solveurs/Assembleur_P_VDF.cpp b/src/VDF/Solveurs/Assembleur_P_VDF.cpp
index e4a28e1360..21452913d2 100644
--- a/src/VDF/Solveurs/Assembleur_P_VDF.cpp
+++ b/src/VDF/Solveurs/Assembleur_P_VDF.cpp
@@ -136,6 +136,7 @@ int Assembleur_P_VDF::construire(Matrice& la_matrice)
   const int nb_faces_periodiques = liste_faces_periodiques(liste_faces_perio);
   const int nb_faces_internes = domaine_vdf.nb_faces_internes();
   const int premiere_face_interne = domaine_vdf.premiere_face_int();
+  ToDo_Kokkos("critical");
   for (i = 0; i < nb_faces_internes + nb_faces_periodiques; i++)
     {
       int face;
@@ -326,6 +327,7 @@ int Assembleur_P_VDF::remplir(Matrice& la_matrice, const DoubleVect& volumes_ent
   const int nb_faces_periodiques = liste_faces_periodiques(liste_faces_perio);
   const int nb_faces_internes = domaine_vdf.nb_faces_internes();
   const int premiere_face_interne = domaine_vdf.premiere_face_int();
+  ToDo_Kokkos("critical");
   for (int i_face = 0; i_face < nb_faces_internes + nb_faces_periodiques; i_face++)
     {
 
@@ -415,6 +417,7 @@ int Assembleur_P_VDF::remplir(Matrice& la_matrice, const DoubleVect& volumes_ent
           carre.set_est_definie(1);
           const int ndeb = la_front_dis.num_premiere_face();
           const int nfin = ndeb + la_front_dis.nb_faces();
+          ToDo_Kokkos("critical");
           for (int num_face = ndeb; num_face < nfin; num_face++)
             {
               // Calcul de rho sur cette face
@@ -447,6 +450,10 @@ int Assembleur_P_VDF::remplir(Matrice& la_matrice, const DoubleVect& volumes_ent
         }
     }
   has_P_ref = (int)mp_max(has_P_ref);
+  // PL: tell the matrix whether it has a constant null space (true when no reference pressure is imposed).
+  // The PETSc solver uses this flag to handle such a case better.
+  // ToDo: Same for VEF, EF, PolyMAC but remove the *2 on coefficient ? Nothing was done for VDF
+  la_matrice->set_has_constant_nullspace(!has_P_ref);
 
   // Verification sanitaire: pas d'element nul sur la diagonale
   for (int i = 0; i < nb_elem; i++)
@@ -565,6 +572,7 @@ void Assembleur_P_VDF::modifier_secmem_pression_imposee(const Neumann_sortie_lib
     {
       const int nb_faces = frontiere_vf.nb_faces();
       const int num_premiere_face = frontiere_vf.num_premiere_face();
+      ToDo_Kokkos("critical");
       for (int i = 0; i < nb_faces; i++)
         {
           const int num_face = num_premiere_face + i;
@@ -584,13 +592,9 @@ void Assembleur_P_VDF::modifier_secmem_pression_imposee(const Neumann_sortie_lib
  */
 void Assembleur_P_VDF::modifier_secmem_vitesse_imposee(const Entree_fluide_vitesse_imposee& cond_lim,
                                                        const Front_VF& frontiere_vf,
-                                                       DoubleTab& secmem)
+                                                       DoubleTab& tab_secmem)
 {
   const Champ_front_base& champ_front = cond_lim.champ_front();
-  const Domaine_VDF& le_dom = le_dom_VDF.valeur();
-  const DoubleVect& face_surfaces = le_dom.face_surfaces();
-  const IntTab& face_voisins = le_dom.face_voisins();
-
   if (get_resoudre_en_u())
     {
       if (champ_front.instationnaire())
@@ -600,22 +604,30 @@ void Assembleur_P_VDF::modifier_secmem_vitesse_imposee(const Entree_fluide_vites
           bool ch_unif = (tab_gpoint.nb_dim()==1 || tab_gpoint.dimension(0)==1);
           const int nb_faces = frontiere_vf.nb_faces();
           const int num_premiere_face = frontiere_vf.num_premiere_face();
-          for (int i = 0; i < nb_faces; i++)
-            {
-              const int num_face = num_premiere_face + i;
-              const double surface = face_surfaces(num_face);
-              const int elem0 = face_voisins(num_face, 0);
-              const int elem1 = face_voisins(num_face, 1);
-              // gpoint est relatif a la normale a la face (elle pointe vers elem1)
-              // La normale est-elle entrante ou sortante ?
-              const double signe = (elem0 < 0) ? 1. : -1.;
-              // Numero de l'element adjacent a la face de bord
-              const int elem = elem0 + elem1 + 1;
-              const int ori = le_dom.orientation(num_face);
-              const double gpoint = nb_dim==1 ? tab_gpoint(ori) : tab_gpoint(ch_unif ? 0 : i, ori);
-
-              secmem[elem] += signe * surface * gpoint;
-            }
+          const Domaine_VDF& le_dom = le_dom_VDF.valeur();
+          const bool nb_dim_1 = (nb_dim == 1);
+          const int ncols = nb_dim_1 ? 1 : tab_gpoint.dimension(1);
+          CDoubleArrView gpoint = static_cast<const ArrOfDouble&>(tab_gpoint).view_ro();
+          CDoubleArrView face_surfaces = static_cast<const ArrOfDouble&>(le_dom.face_surfaces()).view_ro();
+          CIntTabView face_voisins = le_dom.face_voisins().view_ro();
+          CIntArrView orientation = le_dom.orientation().view_ro();
+          DoubleArrView secmem = static_cast<ArrOfDouble&>(tab_secmem).view_rw();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces), KOKKOS_LAMBDA(const int i)
+          {
+            const int num_face = num_premiere_face + i;
+            const double surface = face_surfaces(num_face);
+            const int elem0 = face_voisins(num_face, 0);
+            const int elem1 = face_voisins(num_face, 1);
+            // gpoint is relative to the face normal (which points towards elem1).
+            // signe encodes whether that normal is entering or leaving the domain.
+            const double signe = (elem0 < 0) ? 1. : -1.;
+            // Index of the element adjacent to the boundary face (relies on the missing neighbour being -1)
+            const int elem = elem0 + elem1 + 1;
+            const int ori = orientation(num_face);
+            const int row = nb_dim_1 ? 0 : (ch_unif ? 0 : i);
+            Kokkos::atomic_add(&secmem(elem), signe * surface * gpoint(row * ncols + ori)); // atomic: several boundary faces may share the same element (e.g. corner cells)
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
       else
         {
@@ -628,27 +640,29 @@ void Assembleur_P_VDF::modifier_secmem_vitesse_imposee(const Entree_fluide_vites
     }
 }
 
-int Assembleur_P_VDF::modifier_solution(DoubleTab& pression)
+int Assembleur_P_VDF::modifier_solution(DoubleTab& tab_pression)
 {
-  // Projection :
-  double press_0;
   if(!has_P_ref)
     {
       // On prend la pression minimale comme pression de reference
       // afin d'avoir la meme pression de reference en sequentiel et parallele
-      press_0=DMAXFLOAT;
+      double press_0;
       int nb_elem=le_dom_VDF->domaine().nb_elem();
-      for(int n=0; n<nb_elem; n++)
-        if (pression[n] < press_0)
-          press_0 = pression[n];
+      CDoubleArrView pression = static_cast<const ArrOfDouble&>(tab_pression).view_ro();
+      Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(const int n, double& press_min)
+      {
+        if (pression(n) < press_min) press_min = pression(n);
+      }, Kokkos::Min<double>(press_0));
+      end_gpu_timer(__KERNEL_NAME__);
       press_0 = mp_min(press_0);
-      pression -=press_0;
-      pression.echange_espace_virtuel();
+      tab_pression -= press_0;
+      tab_pression.echange_espace_virtuel();
     }
   return 1;
 }
 int Assembleur_P_VDF::assembler_mat(Matrice& matrice,const DoubleVect& volumes_entrelaces,int incr_pression,int resoudre_en_u)
 {
+  statistics().begin_count(STD_COUNTERS::matrix_assembly,statistics().get_last_opened_counter_level()+1);
   if (!matrice)
     {
       if (je_suis_maitre())
@@ -660,6 +674,7 @@ int Assembleur_P_VDF::assembler_mat(Matrice& matrice,const DoubleVect& volumes_e
   set_resoudre_en_u(resoudre_en_u);
 
   remplir(matrice,volumes_entrelaces, 0);
+  statistics().end_count(STD_COUNTERS::matrix_assembly);
   return 1;
 }
 
@@ -677,7 +692,6 @@ int Assembleur_P_VDF::assembler(Matrice& matrice)
   set_resoudre_en_u(1);
   construire(matrice);
   const Domaine_VDF& domaine_vdf   = le_dom_VDF.valeur();
-
   const DoubleVect& volumes_entrelaces = domaine_vdf.volumes_entrelaces();
   remplir(matrice,volumes_entrelaces, 0);
   return 1;
diff --git a/src/VDF/Solveurs/Assembleur_P_VDF.h b/src/VDF/Solveurs/Assembleur_P_VDF.h
index 314dc27192..aa4e5f7886 100644
--- a/src/VDF/Solveurs/Assembleur_P_VDF.h
+++ b/src/VDF/Solveurs/Assembleur_P_VDF.h
@@ -45,11 +45,13 @@ class Assembleur_P_VDF: public Assembleur_base
   void assembler_continuite(matrices_t matrices, DoubleTab& secmem, int aux_only = 0) const override;
   DoubleTab norme_continuite() const override;
 
+  protected_but_public_for_cuda
+  void modifier_secmem_vitesse_imposee(const Entree_fluide_vitesse_imposee& cond_lim, const Front_VF& frontiere_vf, DoubleTab& secmem);
+
 protected:
   int construire(Matrice& la_matrice);
   int remplir(Matrice& la_matrice, const DoubleVect& volumes_entrelaces, const Champ_Don_base *rho_ptr);
   void modifier_secmem_pression_imposee(const Neumann_sortie_libre& cond_lim, const Front_VF& frontiere_vf, DoubleTab& secmem);
-  void modifier_secmem_vitesse_imposee(const Entree_fluide_vitesse_imposee& cond_lim, const Front_VF& frontiere_vf, DoubleTab& secmem);
   int liste_faces_periodiques(ArrOfInt& faces);
 
   OBS_PTR(Domaine_VDF) le_dom_VDF;
diff --git a/src/VDF/Solveurs/Assembleur_P_VDF_Q4.cpp b/src/VDF/Solveurs/Assembleur_P_VDF_Q4.cpp
index a0451650aa..ef9391bab8 100644
--- a/src/VDF/Solveurs/Assembleur_P_VDF_Q4.cpp
+++ b/src/VDF/Solveurs/Assembleur_P_VDF_Q4.cpp
@@ -58,6 +58,7 @@ int Assembleur_P_VDF_Q4::assembler(Matrice& la_matrice)
   tab2 = 0;
   coeff = 0;
   rang_voisins = 1;
+  ToDo_Kokkos("critical");
   for (face=0 ; face<nbfaces ; face++)
     {
       som0 = FaceSoms(face,0);
@@ -73,6 +74,7 @@ int Assembleur_P_VDF_Q4::assembler(Matrice& la_matrice)
   rang_voisins = 1;
   mat.dimensionner(nbsom,tab1(nbsom)-1);
   //calcul coefficient matrice
+  ToDo_Kokkos("critical");
   for (face=0 ; face<nbfaces ; face++)
     {
       ori = Orientation(face);
@@ -154,6 +156,7 @@ int Assembleur_P_VDF_Q4::modifier_secmem(const DoubleTab& tab_secmem_, DoubleVec
   secmem.resize(nbsom);
   secmem = 0.;
 
+  ToDo_Kokkos("critical");
   for (face=0 ; face<nbfaces ; face++)
     {
       surf = 0.;
diff --git a/src/VDF/Solveurs/Masse_VDF_Elem.cpp b/src/VDF/Solveurs/Masse_VDF_Elem.cpp
index c727f384e6..bc8689d9d0 100644
--- a/src/VDF/Solveurs/Masse_VDF_Elem.cpp
+++ b/src/VDF/Solveurs/Masse_VDF_Elem.cpp
@@ -33,48 +33,62 @@ void Masse_VDF_Elem::preparer_calcul()
   if (use_proto_) preparer_calcul_proto();
 }
 
-DoubleTab& Masse_VDF_Elem::appliquer_impl(DoubleTab& sm) const
+DoubleTab& Masse_VDF_Elem::appliquer_impl(DoubleTab& tab_sm) const
 {
-  if (use_proto_) return appliquer_impl_proto(sm);
+  if (use_proto_) return appliquer_impl_proto(tab_sm);
   else
     {
       const Domaine_VDF& domaine_VDF = le_dom_VDF.valeur();
-      const DoubleVect& volumes = domaine_VDF.volumes(), &porosite_elem = equation().milieu().porosite_elem();
       int nb_elem = domaine_VDF.nb_elem();
       if (nb_elem == 0)
         {
-          sm.echange_espace_virtuel();
-          return sm;
+          tab_sm.echange_espace_virtuel();
+          return tab_sm;
         }
-      int nb_comp = sm.size() / nb_elem;
-      int nb_dim = sm.nb_dim();
-      assert((nb_comp * nb_elem == sm.size()) || (nb_dim == 3));
+      int nb_comp = tab_sm.size() / nb_elem;
+      int nb_dim = tab_sm.nb_dim();
+      assert((nb_comp * nb_elem == tab_sm.size()) || (nb_dim == 3));
+
+      CDoubleArrView volumes = domaine_VDF.volumes().view_ro();
+      CDoubleArrView porosite_elem = equation().milieu().porosite_elem().view_ro();
       if (nb_dim == 1)
-        for (int num_elem = 0; num_elem < nb_elem; num_elem++)
-          sm(num_elem) /= (volumes(num_elem) * porosite_elem(num_elem));
+        {
+          DoubleArrView sm = static_cast<ArrOfDouble&>(tab_sm).view_rw();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA( const int num_elem)
+          {
+            sm(num_elem) /= (volumes(num_elem) * porosite_elem(num_elem));
+          });
+        }
       else if (nb_dim == 2)
         {
-          for (int num_elem = 0; num_elem < nb_elem; num_elem++)
+          DoubleTabView sm = tab_sm.view_rw();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int num_elem)
+          {
             for (int k = 0; k < nb_comp; k++)
               sm(num_elem, k) /= (volumes(num_elem) * porosite_elem(num_elem));
+          });
         }
-      else if (sm.nb_dim() == 3)
+      else if (nb_dim == 3)
         {
           //int d0=sm.dimension(0);
-          int d1 = sm.dimension(1);
-          int d2 = sm.dimension(2);
-          for (int num_elem = 0; num_elem < nb_elem; num_elem++)
+          int d1 = tab_sm.dimension(1);
+          int d2 = tab_sm.dimension(2);
+          DoubleTabView3 sm = tab_sm.view_rw<3>();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int num_elem)
+          {
             for (int k = 0; k < d1; k++)
               for (int d = 0; d < d2; d++)
                 sm(num_elem, k, d) /= (volumes(num_elem) * porosite_elem(num_elem));
+          });
         }
       else
         {
-          Cerr << "Masse_VDF_Elem::appliquer ne peut pas s'appliquer a un DoubleTab a " << sm.nb_dim() << " dimensions" << finl;
+          Cerr << "Masse_VDF_Elem::appliquer ne peut pas s'appliquer a un DoubleTab a " << nb_dim << " dimensions" << finl;
           Process::exit();
         }
-      sm.echange_espace_virtuel();
-      return sm;
+      end_gpu_timer(__KERNEL_NAME__);
+      tab_sm.echange_espace_virtuel();
+      return tab_sm;
     }
 }
 
diff --git a/src/VDF/Solveurs/Masse_VDF_Face.cpp b/src/VDF/Solveurs/Masse_VDF_Face.cpp
index 7b15583f02..215daf86bf 100644
--- a/src/VDF/Solveurs/Masse_VDF_Face.cpp
+++ b/src/VDF/Solveurs/Masse_VDF_Face.cpp
@@ -38,66 +38,55 @@ void Masse_VDF_Face::completer()
   Solveur_Masse_Face_proto::associer_masse_proto(*this,le_dom_VDF.valeur());
 }
 
-DoubleTab& Masse_VDF_Face::appliquer_impl(DoubleTab& sm) const
+DoubleTab& Masse_VDF_Face::appliquer_impl(DoubleTab& tab_sm) const
 {
-  if (sub_type(Pb_Multiphase, equation().probleme())) return Solveur_Masse_Face_proto::appliquer_impl_proto(sm);
+  if (sub_type(Pb_Multiphase, equation().probleme())) return Solveur_Masse_Face_proto::appliquer_impl_proto(tab_sm);
   else
     {
-
       assert(le_dom_VDF);
       assert(le_dom_Cl_VDF);
       const Domaine_VDF& domaine_VDF = le_dom_VDF.valeur();
-      const DoubleVect& porosite_face = equation().milieu().porosite_face();
-      const DoubleVect& volumes_entrelaces = domaine_VDF.volumes_entrelaces();
-      const int nb_faces = domaine_VDF.nb_faces(), N = sm.line_size();
-
-      if (sm.dimension(0) != nb_faces)
-        {
-          Cerr << "Masse_VDF_Face::appliquer :  erreur dans la taille de sm" << finl;
-          Process::exit();
-        }
+      const int nb_faces = domaine_VDF.nb_faces(), N = tab_sm.line_size();
 
-      // Boucle sur les faces joint
+      if (tab_sm.dimension(0) != nb_faces) Process::exit("Masse_VDF_Face::appliquer : erreur dans la taille de tab_sm");
 
       // Boucle sur les bords
       // Sur les faces qui portent des conditions aux limites de Dirichlet ou de Symetrie
       // la vitesse normale reste egale a sa valeur initiale.
       // Donc sur ces faces vpoint doit rester a 0.
-
+      CDoubleArrView porosite_face = equation().milieu().porosite_face().view_ro();
+      CDoubleArrView volumes_entrelaces = domaine_VDF.volumes_entrelaces().view_ro();
+      DoubleTabView sm = tab_sm.view_rw();
       for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++)
         {
-
           // pour chaque Condition Limite on regarde son type
           const Cond_lim& la_cl = le_dom_Cl_VDF->les_conditions_limites(n_bord);
           const Front_VF& la_front_dis = ref_cast(Front_VF, la_cl->frontiere_dis());
           const int ndeb = la_front_dis.num_premiere_face();
           const int nfin = ndeb + la_front_dis.nb_faces();
-
-          if ( sub_type(Dirichlet,la_cl.valeur()) || sub_type(Dirichlet_homogene, la_cl.valeur()))
-            // Pour les faces de Dirichlet on met sm a 0
-            for (int f = ndeb; f < nfin; f++)
-              for (int n = 0; n < N; n++)
-                sm(f, n) = 0;
-          else if (sub_type(Symetrie, la_cl.valeur()))
-            // Pour les faces de Symetrie on met vpoint a 0
-            for (int f = ndeb; f < nfin; f++)
-              for (int n = 0; n < N; n++)
-                sm(f, n) = 0;
-          else
-            for (int f = ndeb; f < nfin; f++)
-              for (int n = 0; n < N; n++)
-                sm(f, n) /= (volumes_entrelaces(f) * porosite_face(f));
-
+          bool null = sub_type(Dirichlet,la_cl.valeur()) || sub_type(Dirichlet_homogene, la_cl.valeur()) || sub_type(Symetrie, la_cl.valeur());
+          // Dirichlet, homogeneous Dirichlet and symmetry faces: sm is set to 0; any other boundary face: sm is divided by volumes_entrelaces * porosite_face
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int f)
+          {
+            for (int n = 0; n < N; n++)
+              {
+                if (null) sm(f, n) = 0;
+                else sm(f, n) /= (volumes_entrelaces(f) * porosite_face(f));
+              }
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
 
       // Boucle sur les faces internes
       const int ndeb = domaine_VDF.premiere_face_int();
-      for (int f = ndeb; f < nb_faces; f++)
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nb_faces), KOKKOS_LAMBDA(const int f)
+      {
         for (int n = 0; n < N; n++)
           sm(f, n) /= (volumes_entrelaces(f) * porosite_face(f));
+      });
+      end_gpu_timer(__KERNEL_NAME__);
       //sm.echange_espace_virtuel();
-      //Debog::verifier("Masse_VDF_Face::appliquer sm",sm);
-      return sm;
+      return tab_sm;
     }
 }
 
@@ -133,6 +122,7 @@ void Masse_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, doubl
 
       /* faces : si CLs, pas de produit par alpha * rho en multiphase */
       DoubleTrav masse(N, N), masse_e(N, N); //masse alpha * rho, contribution
+      ToDo_Kokkos("critical");
       for (f = 0; f < domaine.nb_faces(); f++) //faces reelles
         {
           if (!pbm || fcl(f, 0) >= 2)
@@ -176,6 +166,7 @@ DoubleTab& Masse_VDF_Face::corriger_solution(DoubleTab& x, const DoubleTab& y, i
   const DoubleVect& fs = domaine.face_surfaces();
   int f, n, N = x.line_size(), d, D = dimension;
 
+  ToDo_Kokkos("critical");
   for (f = 0; f < domaine.nb_faces_tot(); f++)
     if (fcl(f, 0) == 2 || fcl(f, 0) == 4)
       for (n = 0; n < N; n++) x(f, n) = incr ? -vit(f, n) : 0; //Dirichlet homogene / Symetrie: on revient a 0
diff --git a/src/VDF/Solveurs/Rayo_semi_transp_solver_VDF.cpp b/src/VDF/Solveurs/Rayo_semi_transp_solver_VDF.cpp
index b7b47c048a..a5ef967c58 100644
--- a/src/VDF/Solveurs/Rayo_semi_transp_solver_VDF.cpp
+++ b/src/VDF/Solveurs/Rayo_semi_transp_solver_VDF.cpp
@@ -86,6 +86,7 @@ void Rayo_semi_transp_solver_VDF::modifier_matrice()
               assert(fluide.longueur_rayo().nb_comp() == 1);
               assert(fluide.kappa().nb_comp() == 1);
 
+              ToDo_Kokkos("critical");
               for (int face = ndeb; face < nfin; face++)
                 {
                   int elem = face_voisins(face, 0);
@@ -222,6 +223,7 @@ void Rayo_semi_transp_solver_VDF::resoudre(double temps)
   assert(fluide.kappa().nb_comp() == 1);
 
   double n = -123., k = -123.;
+  ToDo_Kokkos("critical");
   for (int elem = 0; elem < nb_elem; elem++)
     {
       if (sub_type(Champ_Uniforme, fluide.indice()))
diff --git a/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Elem.cpp b/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Elem.cpp
index ee67c4c7f3..1d763126ca 100644
--- a/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Elem.cpp
+++ b/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Elem.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -169,6 +169,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::init()
       SFichier fic_verif("Tverif.RANS");
       EFichier fic_vit("temperature_RANS.dat");
 
+      ToDo_Kokkos("critical");
       for(int num_elem=0 ; num_elem<nb_elems ; num_elem++)
         {
           int elem;
@@ -178,6 +179,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::init()
         }
     }
 
+  ToDo_Kokkos("critical");
   for(int num_elem=0 ; num_elem<nb_elems ; num_elem++)
     {
       tau(num_elem) = alpha_tau;
@@ -206,6 +208,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::init()
       vit_umoy2 >> trash;
       Cerr << "trash = " << trash << finl;
 
+      ToDo_Kokkos("critical");
       for(int num_elem = 0 ; num_elem<nb_elems ; num_elem++)
         {
           vit_umoy >> utemp_sum(num_elem);
@@ -266,6 +269,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::mettre_a_jour(double temps)
         {
           if (cptbis==0)
             {
+              ToDo_Kokkos("critical");
               for (int num_elem=0; num_elem<nb_elems; num_elem++)
                 {
                   utemp_sum(num_elem) = vitesse(num_elem)*(tps-(f_start-t_av));
@@ -275,6 +279,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::mettre_a_jour(double temps)
             }
           else
             {
+              ToDo_Kokkos("critical");
               for (int num_elem=0; num_elem<nb_elems; num_elem++)
                 {
                   utemp_sum(num_elem) += vitesse(num_elem)*dt;
@@ -287,6 +292,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::mettre_a_jour(double temps)
 
       if(tps>=f_start)
         {
+          ToDo_Kokkos("critical");
           for (int num_elem=0; num_elem<nb_elems; num_elem++)
             {
               utemp_sum(num_elem) += dt*vitesse(num_elem);
@@ -307,6 +313,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::mettre_a_jour(double temps)
         {
           if (cptbis==0)
             {
+              ToDo_Kokkos("critical");
               for (int num_elem=0; num_elem<nb_elems; num_elem++)
                 {
                   utemp_sum(num_elem) = vitesse(num_elem)*dt;
@@ -316,6 +323,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::mettre_a_jour(double temps)
             }
           else
             {
+              ToDo_Kokkos("critical");
               for (int num_elem=0; num_elem<nb_elems; num_elem++)
                 {
                   utemp_sum(num_elem) += vitesse(num_elem)*dt;
@@ -360,6 +368,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Elem::ajouter_blocs(matrices_t matrices, Do
 
   if(((tps>f_start)&&(compteur_reprise > 1))||((moyenne==3)&&(tps>dt_min)))
     {
+      ToDo_Kokkos("critical");
       for(int num_elem = 0 ; num_elem<nb_elems ; num_elem++)
         {
           vol = volume(num_elem);
diff --git a/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Face.cpp b/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Face.cpp
index d9a8d4cf7c..3f873a48f0 100644
--- a/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Face.cpp
+++ b/src/VDF/Sources/Canal/Terme_Source_Canal_RANS_LES_VDF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -195,6 +195,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::init()
       SFichier fic_verif("verif.RANS");
       EFichier fic_vit("vitesse_RANS.dat");
 
+      ToDo_Kokkos("critical");
       for(int num_face=0 ; num_face<nb_faces ; num_face++)
         {
           int face;
@@ -203,6 +204,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::init()
           fic_verif << face << " " << U_RANS(face) << finl;
         }
     }
+  ToDo_Kokkos("critical");
   for(int num_face=0 ; num_face<nb_faces ; num_face++)
     {
       tau(num_face) = alpha_tau;
@@ -248,6 +250,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::init()
       vit_umoy2 >> trash;
       Cerr << "trash = " << trash << finl;
 
+      ToDo_Kokkos("critical");
       for(int num_face = 0 ; num_face<nb_faces ; num_face++)
         {
           vit_umoy >> utemp_sum(num_face);
@@ -330,6 +333,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::init_calcul_moyenne_spat()
   // remplissage des tableaux ci-dessus
 
   // Pour le calcul de u, v, w sur les plans d hmogeneite
+  ToDo_Kokkos("critical");
   for (num_face=0; num_face<nb_faces; num_face++)
     {
       ori = orientation(num_face);
@@ -435,6 +439,7 @@ DoubleTab Terme_Source_Canal_RANS_LES_VDF_Face::norme_vit() const
   double vit1, vit2, vit3;
   DoubleTab norme(nb_faces);
 
+  ToDo_Kokkos("critical");
   for(int num_elem = 0 ; num_elem<nb_elems ; num_elem++)
     {
       //WARNING 3D ONLY !!!!!!
@@ -460,6 +465,7 @@ DoubleTab Terme_Source_Canal_RANS_LES_VDF_Face::norme_vit() const
 
   //Redistribution de la norme sur les faces
 
+  ToDo_Kokkos("critical");
   for(int num_face = 0 ; num_face<nb_faces ; num_face++)
     {
       int elem0 = face_voisins(num_face,0);
@@ -491,6 +497,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::moy_spat(DoubleVect& champ, DoubleVec
   compt_y=0;
   compt_z=0;
 
+  ToDo_Kokkos("critical");
   for (int num_face=0; num_face<nb_faces; num_face++)
     {
       int ori = orientation[num_face];
@@ -672,6 +679,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
           Ny = nb_elems/(nb_faces_y-nb_elems);
           Nz = nb_elems/(nb_faces_z-nb_elems);
 
+          ToDo_Kokkos("critical");
           for (num_face=0; num_face<nb_faces; num_face++)
             {
               int ori = orientation[num_face];
@@ -710,6 +718,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
 
           // Redistribution du profil moyen spatial sur tout le champ de vitesse
 
+          ToDo_Kokkos("critical");
           for(num_face = 0 ; num_face<nb_faces ; num_face++)
             {
               int ori = orientation(num_face);
@@ -827,6 +836,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
         {
           if (cpt==0)
             {
+              ToDo_Kokkos("critical");
               for (int num_face=0; num_face<nb_faces; num_face++)
                 {
                   utemp_sum(num_face) = vitesse(num_face)*dt;
@@ -836,6 +846,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
             }
           else if (tps <= t_av)
             {
+              ToDo_Kokkos("critical");
               for (int num_face=0; num_face<nb_faces; num_face++)
                 {
                   utemp_sum(num_face) += vitesse(num_face)*dt;
@@ -844,6 +855,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
             }
           else
             {
+              ToDo_Kokkos("critical");
               for (int num_face=0; num_face<nb_faces; num_face++)
                 {
                   umoy(num_face) = (dt/t_av)*vitesse(num_face)
@@ -896,6 +908,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
         {
           if (cpt==0)
             {
+              ToDo_Kokkos("critical");
               for (int num_face=0; num_face<nb_faces; num_face++)
                 {
                   utemp_sum(num_face) = vitesse(num_face)*dt;
@@ -905,6 +918,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
             }
           else
             {
+              ToDo_Kokkos("critical");
               for (int num_face=0; num_face<nb_faces; num_face++)
                 {
                   utemp_sum(num_face) += vitesse(num_face)*dt;
@@ -942,6 +956,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
         {
           if (cpt==0)
             {
+              ToDo_Kokkos("critical");
               for (int num_face=0; num_face<nb_faces; num_face++)
                 {
                   //                   duree=100*dt;
@@ -957,6 +972,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
             }
           else
             {
+              ToDo_Kokkos("critical");
               for (int num_face=0; num_face<nb_faces; num_face++)
                 {
                   utemp_sum(num_face) += vitesse(num_face)*dt;
@@ -997,6 +1013,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
   double vol=0;
   DoubleTab norm=norme_vit();
 
+  ToDo_Kokkos("critical");
   for(int num_face = 0 ; num_face<nb_faces ; num_face++)
     {
       vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
@@ -1038,6 +1055,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::mettre_a_jour(double temps)
   static int cpt_sonde=0;
   if(cpt_sonde==0)
     {
+      ToDo_Kokkos("critical");
       for(int face = 0 ; face<nb_faces ; face++)
         {
           if((xv(face,1) > 1.1)&&(xv(face,1) < 0.9)&&(orientation(face)==0))
@@ -1086,6 +1104,7 @@ void Terme_Source_Canal_RANS_LES_VDF_Face::ajouter_blocs(matrices_t matrices, Do
   if((tps>dt_min)&&(tps>f_start))
     {
 
+      ToDo_Kokkos("critical");
       for(int num_face = 0 ; num_face<nb_faces ; num_face++)
         {
           vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
diff --git a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.cpp b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.cpp
index edc8fe7f9a..da63f25513 100644
--- a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.cpp
+++ b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -41,16 +41,22 @@ void Terme_Source_Canal_perio_VDF_Face::associer_domaines(const Domaine_dis_base
   le_dom_Cl_VDF = ref_cast(Domaine_Cl_VDF, domaine_Cl_dis);
 }
 
-void Terme_Source_Canal_perio_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, const tabs_t& semi_impl) const
+void Terme_Source_Canal_perio_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& tab_secmem, const tabs_t& semi_impl) const
 {
   const Domaine_VF& domaine_VF = le_dom_VDF.valeur();
   const Domaine_Cl_dis_base& domaine_Cl_dis = le_dom_Cl_VDF.valeur();
-  const IntVect& orientation = le_dom_VDF->orientation();
-  const DoubleVect& porosite_surf = equation().milieu().porosite_face();
-  const DoubleVect& volumes_entrelaces = domaine_VF.volumes_entrelaces();
-  int ncomp;
-  ArrOfDouble s(source());
+  const IntVect& tab_orientation = le_dom_VDF->orientation();
+  const DoubleVect& tab_porosite_surf = equation().milieu().porosite_face();
+  const DoubleVect& tab_volumes_entrelaces = domaine_VF.volumes_entrelaces();
+  ArrOfDouble tab_s(source());
 
+  // Kokkos views over the arrays above (read-only inputs, read-write right-hand side), captured by the kernels below
+  CDoubleArrView volumes_entrelaces = tab_volumes_entrelaces.view_ro();
+  CDoubleArrView porosite_surf = tab_porosite_surf.view_ro();
+  CIntArrView orientation = tab_orientation.view_ro();
+  DoubleTabView secmem = tab_secmem.view_rw();
+
+  CDoubleArrView s = tab_s.view_ro();
   // Boucle sur les conditions limites pour traiter les faces de bord
   int n_bord, ndeb, nfin;
   for (n_bord = 0; n_bord < domaine_VF.nb_front_Cl(); n_bord++)
@@ -69,12 +75,14 @@ void Terme_Source_Canal_perio_VDF_Face::ajouter_blocs(matrices_t matrices, Doubl
           ndeb = le_bord.num_premiere_face();
           nfin = ndeb + le_bord.nb_faces();
 
-          for (int num_face = ndeb; num_face < nfin; num_face++)
-            {
-              double vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
-              ncomp = orientation(num_face);
-              secmem(num_face) += s[ncomp] * vol;
-            }
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin),KOKKOS_LAMBDA(const int num_face)
+          {
+            double vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
+            int ncomp = orientation(num_face);
+            // The source component is selected by the face orientation; each face writes only its own entry
+            secmem(num_face,0) += s[ncomp] * vol;
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
       else if ((sub_type(Dirichlet, la_cl.valeur())) || (sub_type(Dirichlet_homogene, la_cl.valeur())))
         {
@@ -85,22 +93,20 @@ void Terme_Source_Canal_perio_VDF_Face::ajouter_blocs(matrices_t matrices, Doubl
   // Boucle sur les faces internes
   ndeb = domaine_VF.premiere_face_int();
   int nb_faces = domaine_VF.nb_faces();
-  for (int num_face = ndeb; num_face < nb_faces; num_face++)
-    {
-      double vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
-      ncomp = orientation(num_face);
-      secmem(num_face) += s[ncomp] * vol;
-    }
-
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nb_faces),KOKKOS_LAMBDA(const int num_face)
+  {
+    double vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
+    int ncomp = orientation(num_face);
+    // The source component is selected by the face orientation; each face writes only its own entry
+    secmem(num_face,0) += s[ncomp] * vol;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
 
 void Terme_Source_Canal_perio_VDF_Face::calculer_debit(double& debit_e) const
 {
   const Domaine_VF& domaine_VF = le_dom_VDF.valeur();
   const Domaine_Cl_dis_base& domaine_Cl_dis = le_dom_Cl_VDF.valeur();
-  const DoubleTab& vitesse = equation().inconnue().valeurs();
-  const DoubleVect& porosite_surf = equation().milieu().porosite_face();
-  int ndeb, nfin, num_face;
   int nb_bords = domaine_VF.nb_front_Cl();
   for (int n_bord = 0; n_bord < nb_bords; n_bord++)
     {
@@ -115,31 +121,31 @@ void Terme_Source_Canal_perio_VDF_Face::calculer_debit(double& debit_e) const
               int axe = perio.direction_periodicite();
               assert(axe == direction_ecoulement_);
               debit_e = 0.;
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces() / 2;
+              int ndeb = le_bord.num_premiere_face();
+              int nfin = ndeb + le_bord.nb_faces() / 2;
 
+              CDoubleArrView porosite = equation().milieu().porosite_face().view_ro();
+              CDoubleArrView vitesse = static_cast<const ArrOfDouble&>(equation().inconnue().valeurs()).view_ro();
+              CDoubleTabView face_normales = domaine_VF.face_normales().view_ro();
               if (equation().probleme().is_dilatable() == 1)
                 {
                   // Si l'on est en Quasi/Weakly Compressible, il faut conserver
                   // le debit massique et non pas le debit volumique.
-                  // C'est pour cela que dans le cas QC/WC, on multiplie les vecteurs vitesse
-                  // par la masse volumique discretisee aux faces pour que lorsqu'on integre sur la surface,
-                  // on obtienne bien un debit massique et non pas un debit volumique.
                   const DoubleTab& tab_rho_face = ref_cast(Fluide_Dilatable_base,equation().milieu()).rho_discvit();
-
-                  for (num_face = ndeb; num_face < nfin; num_face++)
-                    {
-                      double debit_face = porosite_surf[num_face] * vitesse[num_face] * std::fabs(domaine_VF.face_normales(num_face, axe));
-                      debit_e += tab_rho_face[num_face] * debit_face;
-                    }
+                  CDoubleArrView rho_face = static_cast<const ArrOfDouble&>(tab_rho_face).view_ro();
+                  Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face, double& sum)
+                  {
+                    sum += rho_face(num_face) * porosite(num_face) * vitesse(num_face) * Kokkos::fabs(face_normales(num_face, axe));
+                  }, debit_e);
+                  end_gpu_timer(__KERNEL_NAME__);
                 }
               else
                 {
-                  for (num_face = ndeb; num_face < nfin; num_face++)
-                    {
-                      double debit_face = porosite_surf[num_face] * vitesse[num_face] * std::fabs(domaine_VF.face_normales(num_face, axe));
-                      debit_e += debit_face;
-                    }
+                  Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face, double& sum)
+                  {
+                    sum += porosite(num_face) * vitesse(num_face) * Kokkos::fabs(face_normales(num_face, axe));
+                  }, debit_e);
+                  end_gpu_timer(__KERNEL_NAME__);
                 }
             }
         }
diff --git a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.h b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.h
index 63b6351aaf..ed0e7fb120 100644
--- a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.h
+++ b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_Face.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -37,15 +37,16 @@ class Terme_Source_Canal_perio_VDF_Face : public Terme_Source_Canal_perio
   Declare_instanciable(Terme_Source_Canal_perio_VDF_Face);
 public :
   inline void dimensionner_blocs(matrices_t matrices, const tabs_t& semi_impl) const override {}
-  void ajouter_blocs(matrices_t matrices, DoubleTab& secmem, const tabs_t& semi_impl) const override;
+  void ajouter_blocs(matrices_t matrices, DoubleTab& tab_secmem, const tabs_t& semi_impl) const override;
   inline int has_interface_blocs() const override { return 1; }
 
+  protected_but_public_for_cuda
+  void calculer_debit(double&) const override;
+
 protected :
   OBS_PTR(Domaine_VDF) le_dom_VDF;
   OBS_PTR(Domaine_Cl_VDF) le_dom_Cl_VDF;
   void associer_domaines(const Domaine_dis_base& ,const Domaine_Cl_dis_base& ) override;
-
-  void calculer_debit(double&) const override;
 };
 
 class Terme_Source_Canal_perio_QC_VDF_Face : public Terme_Source_Canal_perio_VDF_Face
diff --git a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_P0.cpp b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_P0.cpp
index b20d5457a7..afe44e1246 100644
--- a/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_P0.cpp
+++ b/src/VDF/Sources/Canal/Terme_Source_Canal_perio_VDF_P0.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -54,6 +54,7 @@ ArrOfDouble Terme_Source_Canal_perio_VDF_P0::source_convection_diffusion(double
       // Compute source term with
       // Source = -Sum(imposed_heat_flux)/Volume
       // Loop on the faces
+      ToDo_Kokkos("critical");
       for (int num_elem = 0; num_elem < size; num_elem++)
         s[num_elem] = -heat_flux/volume;
     }
@@ -68,6 +69,7 @@ void Terme_Source_Canal_perio_VDF_P0::ajouter_blocs(matrices_t matrices, DoubleT
 
   // Boucle sur les elements internes
   int nb_elem = domaine_VF.nb_elem();
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < nb_elem; num_elem++)
     {
       double vol = volumes(num_elem);
diff --git a/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.cpp b/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.cpp
index 94defcf7ab..0f1f9074df 100644
--- a/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.cpp
+++ b/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -35,29 +35,34 @@ void Source_Fluide_Dilatable_VDF_Proto::associer_volume_porosite_impl(const Doma
   porosites.ref(le_dom_Cl->equation().milieu().porosite_elem());
 }
 
-void Source_Fluide_Dilatable_VDF_Proto::ajouter_impl(const DoubleVect& g,const double rho_m,
-                                                     const DoubleTab& tab_rho, DoubleTab& resu) const
+void Source_Fluide_Dilatable_VDF_Proto::ajouter_impl(const DoubleVect& tab_g,const double rho_m,
+                                                     const DoubleTab& tab_rho, DoubleTab& tab_resu) const
 {
-
-  const int nb_faces = le_dom->nb_faces(), premiere_face_interne = le_dom->premiere_face_int();
-  const IntVect& orientation = le_dom->orientation();
-  const DoubleVect& volumes_entrelaces = le_dom->volumes_entrelaces(), porosite_surf=le_dom_Cl->equation().milieu().porosite_face();
-
+  CIntArrView orientation = le_dom->orientation().view_ro();
+  CDoubleArrView g = tab_g.view_ro();
+  CDoubleArrView volumes_entrelaces = le_dom->volumes_entrelaces().view_ro();
+  CDoubleArrView porosite_surf = le_dom_Cl->equation().milieu().porosite_face().view_ro();
+  CDoubleArrView rho = static_cast<const ArrOfDouble&>(tab_rho).view_ro();
+  DoubleArrView resu = static_cast<ArrOfDouble&>(tab_resu).view_rw();
   for (int num_cl=0 ; num_cl<le_dom->nb_front_Cl() ; num_cl++)
     {
       const Cond_lim& la_cl = le_dom_Cl->les_conditions_limites(num_cl);
       const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
-      const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
-
       if (sub_type(Dirichlet,la_cl.valeur()) || sub_type(Dirichlet_homogene,la_cl.valeur())) { /* Do nothing */ }
       else
         {
-          for (int face=ndeb ; face<nfin ; face++)
-            resu(face) += (tab_rho(face)-rho_m)*g(orientation(face)) * volumes_entrelaces(face)*porosite_surf(face);
+          const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int face)
+          {
+            resu(face) += (rho(face) - rho_m) * g(orientation(face)) * volumes_entrelaces(face) * porosite_surf(face);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
-
-  for (int face=premiere_face_interne ; face<nb_faces; face++)
-    resu(face) += (tab_rho(face)-rho_m)*g(orientation(face)) * volumes_entrelaces(face)*porosite_surf(face);
-
+  const int nb_faces = le_dom->nb_faces(), premiere_face_interne = le_dom->premiere_face_int();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(premiere_face_interne, nb_faces), KOKKOS_LAMBDA(const int face)
+  {
+    resu(face) += (rho(face) - rho_m) * g(orientation(face)) * volumes_entrelaces(face) * porosite_surf(face);
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
diff --git a/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.h b/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.h
index 4b09cba7b8..029b42a626 100644
--- a/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.h
+++ b/src/VDF/Sources/Dilatable/Source_Fluide_Dilatable_VDF_Proto.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -18,6 +18,7 @@
 
 #include <TRUSTTabs_forward.h>
 #include <TRUST_Ref.h>
+#include <kokkos++.h>
 
 class Domaine_Cl_VDF;
 class Equation_base;
@@ -27,10 +28,12 @@ class Domaine_Cl_dis_base;
 
 class Source_Fluide_Dilatable_VDF_Proto
 {
+  protected_but_public_for_cuda
+  void ajouter_impl( const DoubleVect& g, const double rho_m, const DoubleTab& tab_rho, DoubleTab& resu) const;
+
 protected:
   void associer_domaines_impl(const Domaine_dis_base& domaine,const Domaine_Cl_dis_base& domaine_cl);
   void associer_volume_porosite_impl(const Domaine_dis_base& domaine, DoubleVect& volumes, DoubleVect& porosites);
-  void ajouter_impl( const DoubleVect& g, const double rho_m, const DoubleTab& tab_rho, DoubleTab& resu) const;
 
   OBS_PTR(Domaine_Cl_VDF) le_dom_Cl;
   OBS_PTR(Domaine_VDF) le_dom;
diff --git a/src/VDF/Sources/Dilatable/Source_Masse_Fluide_Dilatable_VDF.cpp b/src/VDF/Sources/Dilatable/Source_Masse_Fluide_Dilatable_VDF.cpp
index c923df06d6..6b917f25dc 100644
--- a/src/VDF/Sources/Dilatable/Source_Masse_Fluide_Dilatable_VDF.cpp
+++ b/src/VDF/Sources/Dilatable/Source_Masse_Fluide_Dilatable_VDF.cpp
@@ -18,6 +18,8 @@
 #include <Fluide_Weakly_Compressible.h>
 #include <TRUSTTrav.h>
 #include <Domaine_VF.h>
+#include <kokkos++.h>
+#include <TRUSTArray_kokkos.tpp>
 
 Implemente_instanciable(Source_Masse_Fluide_Dilatable_VDF,"Source_Masse_Fluide_Dilatable_VDF",Source_Masse_Fluide_Dilatable_base);
 
@@ -77,21 +79,20 @@ Entree& Source_Masse_Fluide_Dilatable_VDF::readOn(Entree& is) { return Source_Ma
  *              Y, rho at cell center, same as before for surf and V... This gives well the unit 1 / s, as d(Y)/dt !
  *
  */
-void Source_Masse_Fluide_Dilatable_VDF::ajouter_eq_espece(const Convection_Diffusion_Fluide_Dilatable_base& eqn, const Fluide_Dilatable_base& fluide, const bool is_expl, DoubleVect& resu) const
+void Source_Masse_Fluide_Dilatable_VDF::ajouter_eq_espece(const Convection_Diffusion_Fluide_Dilatable_base& eqn, const Fluide_Dilatable_base& fluide, const bool is_expl, DoubleVect& tab_resu) const
 {
   assert(sub_type(Fluide_Weakly_Compressible,fluide));
-  const DoubleTab& Y = eqn.inconnue().valeurs(), &rho = fluide.masse_volumique().valeurs();
+  const DoubleTab& tab_Y = eqn.inconnue().valeurs(), &tab_rho = fluide.masse_volumique().valeurs();
 
   const Domaine_Cl_dis_base& zclb = domaine_cl_dis_.valeur();
   const Domaine_VF& zvf = ref_cast(Domaine_VF, zclb.domaine_dis());
-  const IntTab& face_voisins = zvf.face_voisins();
 
   // pour post
   Champ_Don_base * post_src_ch = fluide.has_source_masse_espece_champ() ? &ref_cast_non_const(Fluide_Dilatable_base, fluide).source_masse_espece() : nullptr;
 
-  // On commence par remplir val_flux seulement pour les bonnes faces ...
-  DoubleTrav val_flux(zvf.nb_faces(), 1);
-  fill_val_flux_tab(val_flux);
+  // On commence par remplir flux seulement pour les bonnes faces ...
+  DoubleTrav flux(zvf.nb_faces(), 1);
+  fill_val_flux_tab(flux);
 
   // Maintennat on regarde resu ...
   for (int n_bord = 0; n_bord < domaine_cl_dis_->nb_cond_lim(); n_bord++)
@@ -102,19 +103,29 @@ void Source_Masse_Fluide_Dilatable_VDF::ajouter_eq_espece(const Convection_Diffu
       if (le_bord.le_nom() == nom_bord_)
         {
           const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
-          for (int num_face = ndeb; num_face < nfin; num_face++)
-            {
-              const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
-              int elem = elem1 == -1 ? elem2 : elem1;
-              const double surface_elem = zvf.face_surfaces(num_face);
-              double srcmass = -(Y(elem) * val_flux(num_face, 0) * surface_elem) / rho(elem);
-              if (is_expl)
-                srcmass /= zvf.volumes(elem); // on divise par volume (pas de solveur masse dans l'equation ...)
-              resu(elem) += srcmass;
-
-              if (post_src_ch)
-                (*post_src_ch).valeurs()(elem) = srcmass;
-            }
+
+          CIntTabView face_voisins = zvf.face_voisins().view_ro();
+          CDoubleArrView face_surfaces = zvf.face_surfaces().view_ro();
+          CDoubleTabView val_flux = flux.view_ro();
+          CDoubleArrView volumes = zvf.volumes().view_ro();
+          CDoubleArrView Y = static_cast<const ArrOfDouble&>(tab_Y).view_ro();
+          CDoubleArrView rho = static_cast<const ArrOfDouble&>(tab_rho).view_ro();
+          DoubleArrView resu = static_cast<ArrOfDouble&>(tab_resu).view_rw();
+          DoubleArrView post_valeurs = post_src_ch ? static_cast<ArrOfDouble&>((*post_src_ch).valeurs()).view_rw() : DoubleArrView(); // rw, not wo: only elems next to this boundary are written, the other entries must be kept (same as ajouter_projection)
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+          {
+            const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
+            int elem = elem1 == -1 ? elem2 : elem1; // use the neighbour that exists (-1 means no element on that side)
+            const double surface_elem = face_surfaces(num_face);
+            double srcmass = -(Y(elem) * val_flux(num_face, 0) * surface_elem) / rho(elem);
+            if (is_expl)
+              srcmass /= volumes(elem); // divide by the volume (no mass solver in the equation ...)
+            Kokkos::atomic_add(&resu(elem), srcmass); // atomic: several faces may point to the same elem
+
+            if (post_src_ch) // host pointer captured by value, only tested against null on the device
+              Kokkos::atomic_store(&post_valeurs(elem), srcmass);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
 
@@ -123,19 +134,18 @@ void Source_Masse_Fluide_Dilatable_VDF::ajouter_eq_espece(const Convection_Diffu
     (*post_src_ch).mettre_a_jour(fluide.inco_chaleur().temps());
 }
 
-void Source_Masse_Fluide_Dilatable_VDF::ajouter_projection(const Fluide_Dilatable_base& fluide, DoubleVect& resu) const
+void Source_Masse_Fluide_Dilatable_VDF::ajouter_projection(const Fluide_Dilatable_base& fluide, DoubleVect& tab_resu) const
 {
   assert(sub_type(Fluide_Weakly_Compressible,fluide));
   const Domaine_Cl_dis_base& zclb = domaine_cl_dis_.valeur();
   const Domaine_VF& zvf = ref_cast(Domaine_VF, zclb.domaine_dis());
-  const IntTab& face_voisins = zvf.face_voisins();
 
   // pour post
   Champ_Don_base* post_src_ch = fluide.has_source_masse_projection_champ() ? &ref_cast_non_const(Fluide_Dilatable_base, fluide).source_masse_projection() : nullptr;
 
-  // On commence par remplir val_flux seulement pour les bonnes faces ...
-  DoubleTrav val_flux(zvf.nb_faces(), 1);
-  fill_val_flux_tab(val_flux);
+  // On commence par remplir flux seulement pour les bonnes faces ...
+  DoubleTrav flux(zvf.nb_faces(), 1);
+  fill_val_flux_tab(flux);
 
   // Maintennat on regarde resu ...
   for (int n_bord = 0; n_bord < domaine_cl_dis_->nb_cond_lim(); n_bord++)
@@ -147,17 +157,24 @@ void Source_Masse_Fluide_Dilatable_VDF::ajouter_projection(const Fluide_Dilatabl
         {
           const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
 
-          for (int num_face = ndeb; num_face < nfin; num_face++)
-            {
-              const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
-              int elem = elem1 == -1 ? elem2 : elem1;
-              const double surf = zvf.face_surfaces(num_face);
-              const double source_per_dv = val_flux(num_face, 0) * surf / zvf.volumes(elem);  // TODO multiple elements!! units [kg.s-1] / zvf.volumes(elem)
-              resu(elem) -= source_per_dv;  // in [kg.m-3.s-1]
-
-              if (post_src_ch)
-                (*post_src_ch).valeurs()(elem) = source_per_dv;
-            }
+          CIntTabView face_voisins = zvf.face_voisins().view_ro();
+          CDoubleArrView face_surfaces = zvf.face_surfaces().view_ro();
+          CDoubleTabView val_flux = flux.view_ro();
+          CDoubleArrView volumes = zvf.volumes().view_ro();
+          DoubleArrView resu = static_cast<ArrOfDouble&>(tab_resu).view_rw();
+          DoubleArrView post_valeurs = post_src_ch ? static_cast<ArrOfDouble&>((*post_src_ch).valeurs()).view_rw() : DoubleArrView(); // empty view when there is no post-processing field
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+          {
+            const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
+            int elem = elem1 == -1 ? elem2 : elem1; // use the neighbour that exists (-1 means no element on that side)
+            const double surf = face_surfaces(num_face);
+            const double source_per_dv = val_flux(num_face, 0) * surf / volumes(elem); // TODO multiple elements!! units [kg.s-1] / volumes(elem)
+            Kokkos::atomic_add(&resu(elem), -source_per_dv); // in [kg.m-3.s-1]
+
+            if (post_src_ch) // host pointer captured by value, only tested against null on the device
+              Kokkos::atomic_store(&post_valeurs(elem), source_per_dv);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
 
diff --git a/src/VDF/Sources/Dilatable/Source_WC_Chaleur_VDF.cpp b/src/VDF/Sources/Dilatable/Source_WC_Chaleur_VDF.cpp
index de0654fcf0..d33457b71f 100644
--- a/src/VDF/Sources/Dilatable/Source_WC_Chaleur_VDF.cpp
+++ b/src/VDF/Sources/Dilatable/Source_WC_Chaleur_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -133,6 +133,7 @@ void Source_WC_Chaleur_VDF::compute_interpolate_gradP_old(DoubleTab& UgradP_elem
           const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
           const int ndeb = le_bord.num_premiere_face(), nfin = ndeb + le_bord.nb_faces();
 
+          ToDo_Kokkos("critical");
           for (int num_face=ndeb; num_face<nfin; num_face++) grad_Ptot(num_face,0) = 0.;
         }
     }
diff --git a/src/VDF/Sources/PDC/Perte_Charge_Reguliere_VDF_Face.cpp b/src/VDF/Sources/PDC/Perte_Charge_Reguliere_VDF_Face.cpp
index e991abd08b..2d4408c18f 100644
--- a/src/VDF/Sources/PDC/Perte_Charge_Reguliere_VDF_Face.cpp
+++ b/src/VDF/Sources/PDC/Perte_Charge_Reguliere_VDF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -75,6 +75,7 @@ void Perte_Charge_Reguliere_VDF_Face::remplir_num_faces(Nom& nom_sous_domaine)
   IntVect num_loc(domaine_VDF.nb_elem_tot());
   num_loc = -1;
   int num_elem,num_face;
+  ToDo_Kokkos("critical");
   for (num_elem=0; num_elem<nb_poly_ss_domaine; num_elem++)
     num_loc[le_sous_domaine(num_elem)] = num_elem;
 
@@ -86,6 +87,7 @@ void Perte_Charge_Reguliere_VDF_Face::remplir_num_faces(Nom& nom_sous_domaine)
         {
           dir_a_faire = k ;
           if (dir[direction] == 1 ) dir_a_faire = direction ;
+          ToDo_Kokkos("critical");
           for (num_elem=0; num_elem<nb_poly_ss_domaine; num_elem++)
             {
               num_poly = le_sous_domaine(num_elem);
diff --git a/src/VDF/Sources/PDC/Perte_Charge_Singuliere_VDF_Face.cpp b/src/VDF/Sources/PDC/Perte_Charge_Singuliere_VDF_Face.cpp
index 954c7d86dc..bf551f44b8 100644
--- a/src/VDF/Sources/PDC/Perte_Charge_Singuliere_VDF_Face.cpp
+++ b/src/VDF/Sources/PDC/Perte_Charge_Singuliere_VDF_Face.cpp
@@ -191,7 +191,9 @@ void Perte_Charge_Singuliere_VDF_Face::ajouter_blocs(matrices_t matrices, Double
 DoubleTab& Perte_Charge_Singuliere_VDF_Face::ajouter_(const DoubleTab& inco, DoubleTab& resu) const
 {
   const std::string& nom_inco = equation().inconnue().le_nom().getString();
-  ajouter_blocs({}, resu, {{nom_inco, inco}});
+  tabs_t semi_impl;
+  semi_impl[nom_inco].ref(inco); /* avoids copying inco into tabs_t */
+  ajouter_blocs({}, resu, semi_impl);
   return resu;
 }
 
diff --git a/src/VDF/Sources/Source_Generique_VDF_Face.cpp b/src/VDF/Sources/Source_Generique_VDF_Face.cpp
index c6e219318d..0de1f7a067 100644
--- a/src/VDF/Sources/Source_Generique_VDF_Face.cpp
+++ b/src/VDF/Sources/Source_Generique_VDF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -57,12 +57,16 @@ void Source_Generique_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& se
       int nfin = ndeb + le_bord.nb_faces();
       if ((sub_type(Dirichlet, la_cl.valeur())) || (sub_type(Dirichlet_homogene, la_cl.valeur()))) { /* Do nothing */ }
       else
-        for (num_face = ndeb; num_face < nfin; num_face++)
-          {
-            secmem(num_face) += valeurs_calc(num_face) * vol_entrelaces(num_face) * poro_face(num_face);
-          }
+        {
+          ToDo_Kokkos("critical");
+          for (num_face = ndeb; num_face < nfin; num_face++)
+            {
+              secmem(num_face) += valeurs_calc(num_face) * vol_entrelaces(num_face) * poro_face(num_face);
+            }
+        }
     }
 
+  ToDo_Kokkos("critical");
   for (num_face=premiere_face_interne; num_face<nb_faces; num_face++)
     secmem(num_face) += valeurs_calc(num_face)*vol_entrelaces(num_face)*poro_face(num_face);
 
diff --git a/src/VDF/Sources/Sources_It_Eval/Eval_Puiss_Neutr_VDF_Elem.cpp b/src/VDF/Sources/Sources_It_Eval/Eval_Puiss_Neutr_VDF_Elem.cpp
index 0da01def4b..2f2ab8293a 100644
--- a/src/VDF/Sources/Sources_It_Eval/Eval_Puiss_Neutr_VDF_Elem.cpp
+++ b/src/VDF/Sources/Sources_It_Eval/Eval_Puiss_Neutr_VDF_Elem.cpp
@@ -54,9 +54,9 @@ void Eval_Puiss_Neutr_VDF_Elem::associer_repartition(const Nom& n, const Nom& no
       y = xp(i,1);
       if (Objet_U::dimension == 3) z = xp(i,2);
       else z=0.;
-      p.setVar("x",x);
-      p.setVar("y",y);
-      p.setVar("z",z);
+      p.setVar(0,x);
+      p.setVar(1,y);
+      p.setVar(2,z);
       rep(i) *= p.eval();
     }
 }
diff --git a/src/VDF/Sources/Sources_It_Eval/Source_Dirac_VDF_Elem.cpp b/src/VDF/Sources/Sources_It_Eval/Source_Dirac_VDF_Elem.cpp
index e120a589fa..348092330c 100644
--- a/src/VDF/Sources/Sources_It_Eval/Source_Dirac_VDF_Elem.cpp
+++ b/src/VDF/Sources/Sources_It_Eval/Source_Dirac_VDF_Elem.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -45,6 +45,7 @@ void Source_Dirac_VDF_Elem::associer_domaines(const Domaine_dis_base& domaine_di
   const Domaine& mon_dom = domaine_dis.domaine();
   nb_dirac = 0;
 
+  ToDo_Kokkos("critical");
   for (int elem = 0; elem < nb_elem; elem++)
     {
       int test =  mon_dom.type_elem()->contient(point,elem) ;
diff --git a/src/VDF/Sources/Terme_Boussinesq_VDF_Face.cpp b/src/VDF/Sources/Terme_Boussinesq_VDF_Face.cpp
index caa7a6c69c..12c74bf5b4 100644
--- a/src/VDF/Sources/Terme_Boussinesq_VDF_Face.cpp
+++ b/src/VDF/Sources/Terme_Boussinesq_VDF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2025, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -46,26 +46,27 @@ void Terme_Boussinesq_VDF_Face::associer_domaines(const Domaine_dis_base& domain
   le_dom_Cl_VDF = ref_cast(Domaine_Cl_VDF, domaine_Cl_dis);
 }
 
-void Terme_Boussinesq_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& secmem, const tabs_t& semi_impl) const
+void Terme_Boussinesq_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& tab_secmem, const tabs_t& semi_impl) const
 {
   const Domaine_VDF& domaine_VDF = le_dom_VDF.valeur();
   const Domaine_Cl_VDF& domaine_Cl_VDF_hyd = le_dom_Cl_VDF.valeur();
   const Domaine_Cl_dis_base& domaine_Cl_scal = equation_scalaire().domaine_Cl_dis();
   const Domaine_Cl_VDF& domaine_Cl_VDF_scal = ref_cast(Domaine_Cl_VDF,domaine_Cl_scal);
-  const DoubleTab& param = equation_scalaire().inconnue().valeurs();
-  const DoubleTab& beta_valeurs = beta().valeurs();
-  const DoubleVect& grav = gravite().valeurs();
-  const IntTab& face_voisins = domaine_VDF.face_voisins();
-  const IntVect& orientation = domaine_VDF.orientation();
-  const DoubleTab& xv = domaine_VDF.xv();
-  const DoubleVect& porosite_surf = equation().milieu().porosite_face();
-  const DoubleVect& volumes_entrelaces = domaine_VDF.volumes_entrelaces();
-  const DoubleTab& vitesse = equation().inconnue().valeurs();
+  const DoubleTab& tab_param = equation_scalaire().inconnue().valeurs();
+  CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+  CIntArrView orientation = domaine_VDF.orientation().view_ro();
+  CDoubleTabView xv = domaine_VDF.xv().view_ro();
+  CDoubleArrView porosite_surf = equation().milieu().porosite_face().view_ro();
+  CDoubleArrView volumes_entrelaces = domaine_VDF.volumes_entrelaces().view_ro();
+  CDoubleArrView vitesse = static_cast<const ArrOfDouble&>(equation().inconnue().valeurs()).view_ro();
+  CDoubleTabView param = tab_param.view_ro();
+  CDoubleArrView grav = static_cast<const ArrOfDouble&>(gravite().valeurs()).view_ro();
+  CDoubleTabView beta_valeurs = beta().valeurs().view_ro();
+  CDoubleArrView S0 = getScalaire0().view_ro();
+  DoubleArrView secmem = static_cast<ArrOfDouble&>(tab_secmem).view_rw();
 
-  DoubleVect g(dimension);
-  g = grav;
-
-  int nb_dim = param.line_size();
+  const bool is_axi = domaine_VDF.axi;
+  int nb_comp = tab_param.line_size();
 
   // Verifie la validite de T0:
   check();
@@ -85,119 +86,135 @@ void Terme_Boussinesq_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& se
           if (sub_type(Neumann_sortie_libre,la_cl_scal.valeur()))
             {
               const Neumann_sortie_libre& la_cl_neumann_scal = ref_cast(Neumann_sortie_libre, la_cl_scal.valeur());
-              for (int num_face=ndeb; num_face<nfin; num_face++)
-                {
-                  int outlet;
-                  int elem = face_voisins(num_face,0);
-                  if (elem==-1)
-                    {
-                      outlet = (vitesse(num_face)<0?1:0);
-                      elem = face_voisins(num_face,1);
-                    }
-                  else
-                    outlet = (vitesse(num_face)>0?1:0);
-                  double coef=0;
-                  for (int dim=0; dim<nb_dim; dim++)
-                    {
-                      double param_face = (outlet ? valeur(param,elem,dim) : 0.5*(valeur(param,elem,dim)+la_cl_neumann_scal.val_ext(num_face-ndeb,dim)));
-                      coef += valeur(beta_valeurs,elem,elem,dim)*(Scalaire0(dim)-param_face);
-                    }
-
-                  if (axi)
-                    {
-                      double cos_teta = cos(xv(num_face,1));
-                      double sin_teta = sin(xv(num_face,1));
-                      g(0) = grav(0)*cos_teta + grav(1)*sin_teta;
-                      g(1) = grav(1)*cos_teta - grav(0)*sin_teta;
-                    }
-                  double vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
-                  int ncomp = orientation(num_face);
-                  secmem(num_face)+= coef*g(ncomp)*vol;
-                }
+              CDoubleTabView val_ext = la_cl_neumann_scal.tab_val_ext().view_ro();
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
+                int outlet;
+                int elem = face_voisins(num_face,0);
+                if (elem==-1)
+                  {
+                    outlet = (vitesse(num_face)<0?1:0);
+                    elem = face_voisins(num_face,1);
+                  }
+                else
+                  outlet = (vitesse(num_face)>0?1:0);
+                double coef=0;
+                for (int comp=0; comp<nb_comp; comp++)
+                  {
+                    double param_face = (outlet ? valeur(param,elem,comp) : 0.5*(valeur(param,elem,comp)+val_ext(num_face-ndeb,comp)));
+                    coef += valeur(beta_valeurs,elem,elem,comp)*(S0(comp)-param_face);
+                  }
+                int ori = orientation(num_face);
+                double g = grav(ori);
+                if (is_axi)
+                  {
+                    double cos_teta = Kokkos::cos(xv(num_face,1));
+                    double sin_teta = Kokkos::sin(xv(num_face,1));
+                    if (ori==0)
+                      g = grav(0)*cos_teta + grav(1)*sin_teta;
+                    else if (ori==1)
+                      g = grav(1)*cos_teta - grav(0)*sin_teta;
+                  }
+                double vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
+                secmem(num_face) += coef*g*vol;
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
           else if (sub_type(Dirichlet,la_cl_scal.valeur()))
             {
               const Dirichlet& la_cl_diri_scal = ref_cast(Dirichlet,la_cl_scal.valeur());
-              for (int num_face=ndeb; num_face<nfin; num_face++)
-                {
-                  int outlet;
-                  int elem = face_voisins(num_face,0);
-                  if (elem==-1)
-                    {
-                      outlet = (vitesse(num_face)<0?1:0);
-                      elem = face_voisins(num_face,1);
-                    }
-                  else
-                    outlet = (vitesse(num_face)>0?1:0);
+              CDoubleTabView val_imp = la_cl_diri_scal.tab_val_imp().view_ro();
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
+                int outlet;
+                int elem = face_voisins(num_face,0);
+                if (elem==-1)
+                  {
+                    outlet = (vitesse(num_face)<0?1:0);
+                    elem = face_voisins(num_face,1);
+                  }
+                else
+                  outlet = (vitesse(num_face)>0?1:0);
 
-                  double coef=0;
-                  for (int dim=0; dim<nb_dim; dim++)
-                    {
-                      double param_face = (outlet ? valeur(param,elem,dim) : 0.5*(valeur(param,elem,dim)+la_cl_diri_scal.val_imp(num_face-ndeb,dim)));
-                      coef += valeur(beta_valeurs,elem,elem,dim)*(Scalaire0(dim)-param_face);
-                    }
-                  if (axi)
-                    {
-                      double cos_teta = cos(xv(num_face,1));
-                      double sin_teta = sin(xv(num_face,1));
-                      g(0) = grav(0)*cos_teta + grav(1)*sin_teta;
-                      g(1) = grav(1)*cos_teta - grav(0)*sin_teta;
-                    }
-                  double vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
-                  int ncomp = orientation(num_face);
-                  secmem(num_face)+= coef*g(ncomp)*vol;
-                }
+                double coef=0;
+                for (int comp=0; comp<nb_comp; comp++)
+                  {
+                    double param_face = (outlet ? valeur(param,elem,comp) : 0.5*(valeur(param,elem,comp)+val_imp(num_face-ndeb,comp)));
+                    coef += valeur(beta_valeurs,elem,elem,comp)*(S0(comp)-param_face);
+                  }
+                int ori = orientation(num_face);
+                double g = grav(ori); // default: gravity component along the face orientation
+                if (is_axi) // axi: rotate the (0,1) gravity components by the angle stored in xv(num_face,1)
+                  {
+                    double cos_teta = Kokkos::cos(xv(num_face,1));
+                    double sin_teta = Kokkos::sin(xv(num_face,1));
+                    if (ori==0)
+                      g = grav(0)*cos_teta + grav(1)*sin_teta;
+                    else if (ori==1)
+                      g = grav(1)*cos_teta - grav(0)*sin_teta;
+                  }
+                double vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
+                secmem(num_face) += coef*g*vol;
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
         }
       else
         {
-          for (int num_face=ndeb; num_face<nfin; num_face++)
-            {
-              int elem = face_voisins(num_face,0);
-              if (elem == -1) elem = face_voisins(num_face,1);
-              double coef = 0;
-              for (int dim=0; dim<nb_dim; dim++)
-                {
-                  double param_face = valeur(param,elem,dim);
-                  coef += valeur(beta_valeurs,elem,elem,dim)*(Scalaire0(dim)-param_face);
-                }
-              if (axi)
-                {
-                  double cos_teta = cos(xv(num_face,1));
-                  double sin_teta = sin(xv(num_face,1));
-                  g(0) = grav(0)*cos_teta + grav(1)*sin_teta;
-                  g(1) = grav(1)*cos_teta - grav(0)*sin_teta;
-                }
-              int ncomp = orientation(num_face);
-              double vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
-              secmem(num_face) += coef*g(ncomp)*vol;
-            }
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+          {
+            int elem = face_voisins(num_face,0);
+            if (elem == -1) elem = face_voisins(num_face,1);
+            double coef = 0;
+            for (int comp=0; comp<nb_comp; comp++)
+              {
+                double param_face = valeur(param,elem,comp);
+                coef += valeur(beta_valeurs,elem,elem,comp)*(S0(comp)-param_face);
+              }
+            int ori = orientation(num_face);
+            double g = grav(ori);
+            if (is_axi)
+              {
+                double cos_teta = Kokkos::cos(xv(num_face,1));
+                double sin_teta = Kokkos::sin(xv(num_face,1));
+                if (ori==0)
+                  g = grav(0)*cos_teta + grav(1)*sin_teta;
+                else if (ori==1)
+                  g = grav(1)*cos_teta - grav(0)*sin_teta;
+              }
+            double vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
+            secmem(num_face) += coef*g*vol;
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
 
   // Boucle sur les faces internes
   int ndeb = domaine_VDF.premiere_face_int();
   int nb_faces = domaine_VDF.nb_faces();
-  for (int num_face=ndeb; num_face<nb_faces; num_face++)
-    {
-      int elem1 = face_voisins(num_face,0);
-      int elem2 = face_voisins(num_face,1);
-      double coef = 0;
-      for (int dim=0; dim<nb_dim; dim++)
-        {
-          double param_face = 0.5*(valeur(param,elem1,dim)+valeur(param,elem2,dim));
-          coef += valeur(beta_valeurs,elem1,elem2,dim)*(Scalaire0(dim)-param_face);
-        }
-      if (axi)
-        {
-          double cos_teta = cos(xv(num_face,1));
-          double sin_teta = sin(xv(num_face,1));
-          g(0) = grav(0)*cos_teta + grav(1)*sin_teta;
-          g(1) = grav(1)*cos_teta - grav(0)*sin_teta;
-        }
-      int ncomp = orientation(num_face);
-      double vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
-      secmem(num_face) += coef*g(ncomp)*vol;
-    }
-
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nb_faces), KOKKOS_LAMBDA(const int num_face)
+  {
+    int elem1 = face_voisins(num_face,0);
+    int elem2 = face_voisins(num_face,1);
+    double coef = 0;
+    for (int comp=0; comp<nb_comp; comp++)
+      {
+        double param_face = 0.5*(valeur(param,elem1,comp)+valeur(param,elem2,comp));
+        coef += valeur(beta_valeurs,elem1,elem2,comp)*(S0(comp)-param_face);
+      }
+    int ori = orientation(num_face);
+    double g = grav(ori);
+    if (is_axi)
+      {
+        double cos_teta = Kokkos::cos(xv(num_face,1));
+        double sin_teta = Kokkos::sin(xv(num_face,1));
+        if (ori==0)
+          g = grav(0)*cos_teta + grav(1)*sin_teta;
+        else if (ori==1)
+          g = grav(1)*cos_teta - grav(0)*sin_teta;
+      }
+    double vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
+    secmem(num_face) += coef*g*vol;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
diff --git a/src/VDF/Sources/Terme_Source_Acceleration_VDF_Face.cpp b/src/VDF/Sources/Terme_Source_Acceleration_VDF_Face.cpp
index 757d38d284..85d7e4cca5 100644
--- a/src/VDF/Sources/Terme_Source_Acceleration_VDF_Face.cpp
+++ b/src/VDF/Sources/Terme_Source_Acceleration_VDF_Face.cpp
@@ -80,6 +80,7 @@ static void TSAVDF_ajouter_liste_faces(const int premiere_face, const int dernie
   const DoubleTab * const rho_elem =
     (bool(ref_rho)) ? &(ref_rho->valeurs()) : 0;
 
+  ToDo_Kokkos("critical");
   for (num_face=premiere_face; num_face<derniere_face; num_face++)
     {
       const double vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
@@ -190,7 +191,6 @@ void Terme_Source_Acceleration_VDF_Face::ajouter_blocs(matrices_t matrices, Doub
             const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
             int nb_faces_bord=le_bord.nb_faces();
             ArrOfInt fait(nb_faces_bord);
-            fait = 0;
             for (int ind_face=0; ind_face<nb_faces_bord; ind_face++)
               {
                 if (fait[ind_face] == 0)
@@ -236,6 +236,7 @@ const DoubleTab& Terme_Source_Acceleration_VDF_Face::calculer_vitesse_faces(
   v_faces_stockage.resize(nb_faces, dim);
   int i_face;
   ArrOfDouble composante_vitesse(3);
+  ToDo_Kokkos("critical");
   for (i_face = 0; i_face < nb_faces; i_face++)
     {
       const int orientation_face = orientation(i_face);
diff --git a/src/VDF/Sources/Terme_Source_Coriolis_VDF_Face.cpp b/src/VDF/Sources/Terme_Source_Coriolis_VDF_Face.cpp
index 972dbba65b..6e5056a729 100644
--- a/src/VDF/Sources/Terme_Source_Coriolis_VDF_Face.cpp
+++ b/src/VDF/Sources/Terme_Source_Coriolis_VDF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -124,6 +124,7 @@ void Terme_Source_Coriolis_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTa
           ndeb = le_bord.num_premiere_face();
           nfin = ndeb + le_bord.nb_faces();
 
+          ToDo_Kokkos("critical");
           for (num_face=ndeb; num_face<nfin; num_face++)
             {
               vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
@@ -150,6 +151,7 @@ void Terme_Source_Coriolis_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTa
           ndeb = le_bord.num_premiere_face();
           nfin = ndeb + le_bord.nb_faces();
 
+          ToDo_Kokkos("critical");
           for (num_face=ndeb; num_face<nfin; num_face++)
             {
               vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
@@ -164,6 +166,7 @@ void Terme_Source_Coriolis_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTa
   // Boucle sur les faces internes
 
   ndeb = domaine_VDF.premiere_face_int();
+  ToDo_Kokkos("critical");
   for (num_face =domaine_VDF.premiere_face_int(); num_face<domaine_VDF.nb_faces(); num_face++)
     {
       vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
@@ -200,6 +203,7 @@ void Terme_Source_Coriolis_VDF_Face::calculer_force_de_Coriolis() const
     {
     case 2:
       {
+        ToDo_Kokkos("critical");
         for (num_elem=0; num_elem <nb_elems; num_elem++)
           {
             for (i=0; i<dimension; i++)
@@ -212,6 +216,7 @@ void Terme_Source_Coriolis_VDF_Face::calculer_force_de_Coriolis() const
       }
     case 3:
       {
+        ToDo_Kokkos("critical");
         for (num_elem=0; num_elem <nb_elems; num_elem++)
           {
             for (i=0; i<dimension; i++)
@@ -246,6 +251,7 @@ void Terme_Source_Coriolis_VDF_Face::calculer_force_de_Coriolis() const
       const Fluide_Dilatable_base& le_fluide = ref_cast(Fluide_Dilatable_base,eq_hydraulique().milieu());
       const DoubleTab& tab_rho_elem = le_fluide.masse_volumique().valeurs();
       double rhoelem;
+      ToDo_Kokkos("critical");
       for (num_elem=0; num_elem <nb_elems; num_elem++)
         {
           rhoelem=tab_rho_elem[num_elem];
diff --git a/src/VDF/Sources/Terme_Source_Qdm_VDF_Face.cpp b/src/VDF/Sources/Terme_Source_Qdm_VDF_Face.cpp
index 917efb3039..36f6819aa9 100644
--- a/src/VDF/Sources/Terme_Source_Qdm_VDF_Face.cpp
+++ b/src/VDF/Sources/Terme_Source_Qdm_VDF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -50,95 +50,99 @@ void Terme_Source_Qdm_VDF_Face::associer_domaines(const Domaine_dis_base& domain
   le_dom_Cl_VDF = ref_cast(Domaine_Cl_VDF, domaine_Cl_dis);
 }
 
-void Terme_Source_Qdm_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& resu, const tabs_t& semi_impl) const
+void Terme_Source_Qdm_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& tab_resu, const tabs_t& semi_impl) const
 {
   const Domaine_VDF& domaine_VDF = le_dom_VDF.valeur();
   const Domaine_Cl_VDF& domaine_Cl_VDF = le_dom_Cl_VDF.valeur();
-  const IntTab& face_voisins = domaine_VDF.face_voisins();
-  const IntVect& orientation = domaine_VDF.orientation();
-  const DoubleVect& porosite_surf = equation().milieu().porosite_face();
-  const DoubleVect& volumes_entrelaces = domaine_VDF.volumes_entrelaces();
+  const DoubleTab* ptr_alp = sub_type(Pb_Multiphase, equation().probleme()) ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr;
+  const DoubleTab* ptr_rho = ptr_alp ? &equation().milieu().masse_volumique().passe() : nullptr;
 
-  // useful only if multiphase problem
-  const DoubleTab* alp = sub_type(Pb_Multiphase, equation().probleme()) ? &ref_cast(Pb_Multiphase, equation().probleme()).equation_masse().inconnue().passe() : nullptr;
-  const DoubleTab* rho = alp ? &equation().milieu().masse_volumique().passe() : nullptr;
-
-  const int cR = alp ? ((*rho).dimension_tot(0) == 1) : 0;
+  const int cR = ptr_alp ? ((*ptr_rho).dimension_tot(0) == 1) : 0;
   const int nb_comp = equation().inconnue().valeurs().line_size();
 
-  double vol;
-  int ndeb, nfin, ncomp, num_face, elem1, elem2;
+  const bool has_alp = (ptr_alp != nullptr);
+  CDoubleArrView volumes_entrelaces = domaine_VDF.volumes_entrelaces().view_ro();
+  CDoubleArrView porosite_surf = equation().milieu().porosite_face().view_ro();
+  CIntArrView orientation = domaine_VDF.orientation().view_ro();
+  CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+  DoubleTabView resu = tab_resu.view_rw();
+  CDoubleTabView alp, rho;
+  if (has_alp)
+    {
+      alp = ptr_alp->view_ro();
+      rho = ptr_rho->view_ro();
+    }
 
   if (sub_type(Champ_Uniforme, la_source.valeur()))
     {
-      const DoubleVect& s = la_source->valeurs();
+      const DoubleVect& tab_s = la_source->valeurs();
+      CDoubleArrView s = tab_s.view_ro();
 
-      // Boucle sur les conditions limites pour traiter les faces de bord : pour chaque Condition Limite on regarde son type
-      // Si face de Dirichlet ou de Symetrie on ne fait rien
-      // Si face de Neumann on calcule la contribution au terme source
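+      // Loop over the boundary conditions to handle boundary faces: Periodique and Neumann_sortie_libre faces contribute to the source term, Symetrie and Dirichlet faces are left untouched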
       for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++)
         {
           const Cond_lim& la_cl = domaine_Cl_VDF.les_conditions_limites(n_bord);
-
+          const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
+          const int ndeb = le_bord.num_premiere_face();
+          const int nfin = ndeb + le_bord.nb_faces();
           if (sub_type(Periodique, la_cl.valeur()))
             {
-              if (alp) Process::exit("Terme_Source_Qdm_VDF_Face : periodic CL not yet available for Pb_Multiphase !");
-
-              const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-
-              for (int k = 0; k < nb_comp; k++)
-                for (num_face = ndeb; num_face < nfin; num_face++)
-                  {
-                    vol = volumes_entrelaces(num_face);
-                    ncomp = orientation(num_face);
-                    resu(num_face, k) += s(nb_comp * ncomp + k) * vol;
-                  }
+              if (ptr_alp) Process::exit("Terme_Source_Qdm_VDF_Face : periodic CL not yet available for Pb_Multiphase !");
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
+                const double vol = volumes_entrelaces(num_face);
+                const int ncomp = orientation(num_face);
+                for (int k = 0; k < nb_comp; k++)
+                  resu(num_face, k) += s(nb_comp * ncomp + k) * vol;
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
           else if (sub_type(Neumann_sortie_libre, la_cl.valeur()))
             {
-              const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-
-              for (int k = 0; k < nb_comp; k++)
-                for (num_face = ndeb; num_face < nfin; num_face++)
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
+                const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
+                const int ncomp = orientation(num_face);
+                for (int k = 0; k < nb_comp; k++)
                   {
-                    vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
-                    ncomp = orientation(num_face);
                     double alpha_rho = 1.0;
-                    if (alp)
+                    if (has_alp)
                       {
-                        elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
-                        const int e = ( elem1 > -1 ? elem1 : elem2);
-                        double a = (*alp)(e, k), r = (*rho)(!cR * e, k);
-                        alpha_rho = a * r;
+                        const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
+                        const int e = (elem1 > -1 ? elem1 : elem2);
+                        alpha_rho = alp(e, k) * rho(!cR * e, k);
                       }
                     resu(num_face, k) += s(nb_comp * ncomp + k) * vol * alpha_rho;
                   }
-
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
           else if (sub_type(Symetrie, la_cl.valeur())) { /* Do nothing */}
           else if ((sub_type(Dirichlet, la_cl.valeur())) || (sub_type(Dirichlet_homogene, la_cl.valeur()))) { /* Do nothing */}
         }
 
       // Boucle sur les faces internes
-      ndeb = domaine_VDF.premiere_face_int();
-      for (int k = 0; k < nb_comp; k++)
-        for (num_face = domaine_VDF.premiere_face_int(); num_face < domaine_VDF.nb_faces(); num_face++)
+      const int ndeb_int = domaine_VDF.premiere_face_int();
+      const int nfin_int = domaine_VDF.nb_faces();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb_int, nfin_int), KOKKOS_LAMBDA(const int num_face)
+      {
+        const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
+        const int ncomp = orientation(num_face);
+        const int elem1 = face_voisins(num_face, 0);
+        const int elem2 = face_voisins(num_face, 1);
+        for (int k = 0; k < nb_comp; k++)
           {
-            vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
-            ncomp = orientation(num_face);
             double alpha_rho = 1.0;
-            if (alp)
+            if (has_alp)
               {
-                elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
-                double a = 0.5 * ((*alp)(elem1, k) + (*alp)(elem2, k)), r = 0.5 * ((*rho)(!cR * elem1, k) + (*rho)(!cR * elem2, k));
+                double a = 0.5 * (alp(elem1, k) + alp(elem2, k));
+                double r = 0.5 * (rho(!cR * elem1, k) + rho(!cR * elem2, k));
                 alpha_rho = a * r;
               }
             resu(num_face, k) += s(nb_comp * ncomp + k) * vol * alpha_rho;
           }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
   else // le champ source n'est plus uniforme
     {
@@ -147,83 +151,83 @@ void Terme_Source_Qdm_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& re
       if (la_source->que_suis_je().contient("_som_"))
         {
           // Need to interpolate
-          const int N = resu.dimension(1), D = dimension;
-          eval.resize(resu.dimension(0), N * D);
+          const int N = tab_resu.dimension(1), D = dimension;
+          eval.resize(tab_resu.dimension(0), N * D);
           la_source->valeur_aux(domaine_VDF.xp(), eval);
           s_tmp = &eval;
         }
       else
         s_tmp = &(la_source->valeurs());
-      const DoubleTab& s = *s_tmp;
+      const DoubleTab& tab_s = *s_tmp;
+      CDoubleTabView s = tab_s.view_ro();
 
-      // Boucle sur les conditions limites pour traiter les faces de bord : pour chaque Condition Limite on regarde son type
-      // Si face de Dirichlet ou de Symetrie on ne fait rien
-      // Si face de Neumann on calcule la contribution au terme source
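+      // Loop over the boundary conditions: Neumann_sortie_libre and Periodique faces contribute to the source term, Symetrie and Dirichlet faces are left untouched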
       for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++)
         {
           const Cond_lim& la_cl = domaine_Cl_VDF.les_conditions_limites(n_bord);
-
+          const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
+          const int ndeb = le_bord.num_premiere_face();
+          const int nfin = ndeb + le_bord.nb_faces();
           if (sub_type(Neumann_sortie_libre, la_cl.valeur()))
             {
-              const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-
-              for (int k = 0; k < nb_comp; k++)
-                for (num_face = ndeb; num_face < nfin; num_face++)
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
+                const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
+                const int ncomp = orientation(num_face);
+                const int elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
+                const int e = (elem1 > -1 ? elem1 : elem2);
+                for (int k = 0; k < nb_comp; k++)
                   {
-                    vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
-                    ncomp = orientation(num_face);
-                    elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
-                    const int e = (elem1 > -1 ? elem1 : elem2);
                     double alpha_rho = 1.0;
-                    if (alp)
-                      {
-                        double a = (*alp)(e, k), r = (*rho)(!cR * e, k);
-                        alpha_rho = a * r;
-                      }
+                    if (has_alp)
+                      alpha_rho = alp(e, k) * rho(!cR * e, k);
                     resu(num_face, k) += s(e, nb_comp * ncomp + k) * vol * alpha_rho;
                   }
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
           else if (sub_type(Symetrie, la_cl.valeur())) { /* Do nothing */}
           else if ((sub_type(Dirichlet, la_cl.valeur())) || (sub_type(Dirichlet_homogene, la_cl.valeur()))) { /* Do nothing */}
           else if (sub_type(Periodique, la_cl.valeur()))
             {
-              if (alp) Process::exit("Terme_Source_Qdm_VDF_Face : periodic CL not yet available for Pb_Multiphase !");
-
-              const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-
-              for (int k = 0; k < nb_comp; k++)
-                for (num_face = ndeb; num_face < nfin; num_face++)
+              if (ptr_alp) Process::exit("Terme_Source_Qdm_VDF_Face : periodic CL not yet available for Pb_Multiphase !");
+              Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+              {
+                const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
+                const int ncomp = orientation(num_face);
+                for (int k = 0; k < nb_comp; k++)
                   {
-                    vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
-                    ncomp = orientation(num_face);
-                    double s_face = 0.5 * (s(face_voisins(num_face, 0), nb_comp * ncomp + k) + s(face_voisins(num_face, 1), nb_comp * ncomp + k));
+                    const double s_face = 0.5 * (s(face_voisins(num_face, 0), nb_comp * ncomp + k) + s(face_voisins(num_face, 1), nb_comp * ncomp + k));
                     resu(num_face, k) += s_face * vol;
                   }
+              });
+              end_gpu_timer(__KERNEL_NAME__);
             }
         }
 
       // Boucle sur les faces internes
-      ndeb = domaine_VDF.premiere_face_int();
-
-      for (int k = 0; k < nb_comp; k++)
-        for (num_face = domaine_VDF.premiere_face_int(); num_face < domaine_VDF.nb_faces(); num_face++)
+      const int ndeb_int = domaine_VDF.premiere_face_int();
+      const int nfin_int = domaine_VDF.nb_faces();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb_int, nfin_int), KOKKOS_LAMBDA(const int num_face)
+      {
+        const double vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
+        const int ncomp = orientation(num_face);
+        const int elem1 = face_voisins(num_face, 0);
+        const int elem2 = face_voisins(num_face, 1);
+        for (int k = 0; k < nb_comp; k++)
           {
-            vol = volumes_entrelaces(num_face) * porosite_surf(num_face);
-            ncomp = orientation(num_face);
-            elem1 = face_voisins(num_face, 0), elem2 = face_voisins(num_face, 1);
             double alpha_rho = 1.0;
-            if (alp)
+            if (has_alp)
               {
-                double a = 0.5 * ((*alp)(elem1, k) + (*alp)(elem2, k)), r = 0.5 * ((*rho)(!cR * elem1, k) + (*rho)(!cR * elem2, k));
+                double a = 0.5 * (alp(elem1, k) + alp(elem2, k));
+                double r = 0.5 * (rho(!cR * elem1, k) + rho(!cR * elem2, k));
                 alpha_rho = a * r;
               }
-            double s_face = 0.5 * (s(elem1, nb_comp * ncomp + k) + s(elem2, nb_comp * ncomp + k));
-            resu(num_face,k ) += s_face * vol * alpha_rho;
+            const double s_face = 0.5 * (s(elem1, nb_comp * ncomp + k) + s(elem2, nb_comp * ncomp + k));
+            resu(num_face, k) += s_face * vol * alpha_rho;
           }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
 }
 
diff --git a/src/VDF/Sources/Terme_Source_Solide_SWIFT_VDF.cpp b/src/VDF/Sources/Terme_Source_Solide_SWIFT_VDF.cpp
index c8512934ef..0c1c086cf2 100644
--- a/src/VDF/Sources/Terme_Source_Solide_SWIFT_VDF.cpp
+++ b/src/VDF/Sources/Terme_Source_Solide_SWIFT_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -163,6 +163,7 @@ void Terme_Source_Solide_SWIFT_VDF::init_calcul_moyenne(const Conduction& my_eqn
 
   j = 0;
   indic = 0;
+  ToDo_Kokkos("critical");
   for (num_elem = 0; num_elem < nb_elems; num_elem++)
     {
       y = xp(num_elem, 1);
@@ -252,6 +253,7 @@ void Terme_Source_Solide_SWIFT_VDF::ajouter_blocs(matrices_t matrices, DoubleTab
   calcul_moyenne(eq_swift.valeur(), Tmoy_swift, corresp_swift, compt_swift);
   calcul_moyenne(eq_corse.valeur(), Tmoy_corse, corresp_corse, compt_corse);
 
+  ToDo_Kokkos("critical");
   for (int num_elem = 0; num_elem < nb_elems; num_elem++)
     {
       resu(num_elem) += volume(num_elem) * (Tmoy_corse(corresp_SC[corresp_swift[num_elem]]) - Tmoy_swift(corresp_swift[num_elem])) / tau;
diff --git a/src/VDF/Sources/Terme_Source_inc_VDF_Face.cpp b/src/VDF/Sources/Terme_Source_inc_VDF_Face.cpp
index fe0862c0c6..c84cbf12d7 100644
--- a/src/VDF/Sources/Terme_Source_inc_VDF_Face.cpp
+++ b/src/VDF/Sources/Terme_Source_inc_VDF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -751,37 +751,43 @@ void Terme_Source_inc_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& se
   // Evaluate the dynamic model coefficient f
   int elem;
   if (dimension == 2)
-    for (elem=0; elem<nb_elem; elem++)
-      {
-        //         temp_1(elem)=0.;
-        //         temp_2(elem)=0.;
-        //     for (int k=0 ; k<dimension ; k++)
-        //         for (int l=0 ; l<dimension ; l++)
-        //           {
-        //             temp_1(elem)+=Lij(elem,k,k);
-        //             temp_2(elem)+=4.*Qij(elem,l,l);
-        //           }
-
-        temp_1(elem)=Lij_1(elem,0)+Lij_2(elem,1);
-        temp_2(elem)=4.*Qij_1(elem,0)+4.*Qij_2(elem,1);
-        if(std::fabs(temp_2(elem)) < 1.e-12)
-          f(elem)=0.;
-        else
-          f(elem)=temp_1(elem)/temp_2(elem);
-      }
+    {
+      ToDo_Kokkos("critical");
+      for (elem=0; elem<nb_elem; elem++)
+        {
+          //         temp_1(elem)=0.;
+          //         temp_2(elem)=0.;
+          //     for (int k=0 ; k<dimension ; k++)
+          //         for (int l=0 ; l<dimension ; l++)
+          //           {
+          //             temp_1(elem)+=Lij(elem,k,k);
+          //             temp_2(elem)+=4.*Qij(elem,l,l);
+          //           }
+
+          temp_1(elem)=Lij_1(elem,0)+Lij_2(elem,1);
+          temp_2(elem)=4.*Qij_1(elem,0)+4.*Qij_2(elem,1);
+          if(std::fabs(temp_2(elem)) < 1.e-12)
+            f(elem)=0.;
+          else
+            f(elem)=temp_1(elem)/temp_2(elem);
+        }
+    }
   else
-    for (elem=0; elem<nb_elem; elem++)
-      {
-        //         temp_1(elem)=0.;
-        //         temp_2(elem)=0.;
-
-        temp_1(elem)=Lij_1(elem,0)+Lij_2(elem,1)+Lij_3(elem,2);
-        temp_2(elem)=4.*Qij_1(elem,0)+4.*Qij_2(elem,1)+4.*Qij_3(elem,2);
-        if(std::fabs(temp_2(elem)) < 1.e-12)
-          f(elem)=0.;
-        else
-          f(elem)=temp_1(elem)/temp_2(elem);
-      }
+    {
+      ToDo_Kokkos("critical");
+      for (elem=0; elem<nb_elem; elem++)
+        {
+          //         temp_1(elem)=0.;
+          //         temp_2(elem)=0.;
+
+          temp_1(elem)=Lij_1(elem,0)+Lij_2(elem,1)+Lij_3(elem,2);
+          temp_2(elem)=4.*Qij_1(elem,0)+4.*Qij_2(elem,1)+4.*Qij_3(elem,2);
+          if(std::fabs(temp_2(elem)) < 1.e-12)
+            f(elem)=0.;
+          else
+            f(elem)=temp_1(elem)/temp_2(elem);
+        }
+    }
 
   f.echange_espace_virtuel();
   //   Cerr << " calculer_dQij_j" << f << " " << Qij << finl;
@@ -969,6 +975,7 @@ void Terme_Source_inc_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& se
           ndeb = le_bord.num_premiere_face();
           nfin = ndeb + le_bord.nb_faces();
 
+          ToDo_Kokkos("critical");
           for (num_face=ndeb; num_face<nfin; num_face++)
             {
               vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
@@ -995,6 +1002,7 @@ void Terme_Source_inc_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& se
           ndeb = le_bord.num_premiere_face();
           nfin = ndeb + le_bord.nb_faces();
 
+          ToDo_Kokkos("critical");
           for (num_face=ndeb; num_face<nfin; num_face++)
             {
               vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
@@ -1017,6 +1025,7 @@ void Terme_Source_inc_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab& se
   int num_e1=-1, n_comp1=-1;
   int num_e2=-1, n_comp2=-1;
   ndeb = domaine_VDF.premiere_face_int();
+  ToDo_Kokkos("critical");
   for (num_face =domaine_VDF.premiere_face_int(); num_face<domaine_VDF.nb_faces(); num_face++)
     {
 
diff --git a/src/VDF/Sources/Terme_Source_inc_th_VDF_Face.cpp b/src/VDF/Sources/Terme_Source_inc_th_VDF_Face.cpp
index 20e8cbc31b..c962d7ba2e 100644
--- a/src/VDF/Sources/Terme_Source_inc_th_VDF_Face.cpp
+++ b/src/VDF/Sources/Terme_Source_inc_th_VDF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -785,37 +785,43 @@ void Terme_Source_inc_th_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab&
   // Evaluate the dynamic model coefficient f
   int elem;
   if (dimension == 2)
-    for (elem=0; elem<nb_elem; elem++)
-      {
-        //         temp_1(elem)=0.;
-        //         temp_2(elem)=0.;
-        //     for (int k=0 ; k<dimension ; k++)
-        //         for (int l=0 ; l<dimension ; l++)
-        //           {
-        //             temp_1(elem)+=Lij(elem,k,k);
-        //             temp_2(elem)+=4.*Qij(elem,l,l);
-        //           }
-
-        temp_1(elem)=Lij(elem,0)+Lij(elem,1);
-        temp_2(elem)=4.*Qij(elem,0)+4.*Qij(elem,1);
-        if(std::fabs(temp_2(elem)) < 1.e-12)
-          f(elem)=0.;
-        else
-          f(elem)=temp_1(elem)/temp_2(elem);
-      }
+    {
+      ToDo_Kokkos("critical");
+      for (elem=0; elem<nb_elem; elem++)
+        {
+          //         temp_1(elem)=0.;
+          //         temp_2(elem)=0.;
+          //     for (int k=0 ; k<dimension ; k++)
+          //         for (int l=0 ; l<dimension ; l++)
+          //           {
+          //             temp_1(elem)+=Lij(elem,k,k);
+          //             temp_2(elem)+=4.*Qij(elem,l,l);
+          //           }
+
+          temp_1(elem)=Lij(elem,0)+Lij(elem,1);
+          temp_2(elem)=4.*Qij(elem,0)+4.*Qij(elem,1);
+          if(std::fabs(temp_2(elem)) < 1.e-12)
+            f(elem)=0.;
+          else
+            f(elem)=temp_1(elem)/temp_2(elem);
+        }
+    }
   else
-    for (elem=0; elem<nb_elem; elem++)
-      {
-        //         temp_1(elem)=0.;
-        //         temp_2(elem)=0.;
-
-        temp_1(elem)=Lij(elem,0)+Lij(elem,1)+Lij(elem,2);
-        temp_2(elem)=4.*Qij(elem,0)+4.*Qij(elem,1)+4.*Qij(elem,2);
-        if(std::fabs(temp_2(elem)) < 1.e-12)
-          f(elem)=0.;
-        else
-          f(elem)=temp_1(elem)/temp_2(elem);
-      }
+    {
+      ToDo_Kokkos("critical");
+      for (elem=0; elem<nb_elem; elem++)
+        {
+          //         temp_1(elem)=0.;
+          //         temp_2(elem)=0.;
+
+          temp_1(elem)=Lij(elem,0)+Lij(elem,1)+Lij(elem,2);
+          temp_2(elem)=4.*Qij(elem,0)+4.*Qij(elem,1)+4.*Qij(elem,2);
+          if(std::fabs(temp_2(elem)) < 1.e-12)
+            f(elem)=0.;
+          else
+            f(elem)=temp_1(elem)/temp_2(elem);
+        }
+    }
 
   f.echange_espace_virtuel();
   //   Cerr << " calculer_dQij_j" << f << " " << Qij << finl;
@@ -1009,6 +1015,7 @@ void Terme_Source_inc_th_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab&
           ndeb = le_bord.num_premiere_face();
           nfin = ndeb + le_bord.nb_faces();
 
+          ToDo_Kokkos("critical");
           for (num_face=ndeb; num_face<nfin; num_face++)
             {
               vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
@@ -1036,6 +1043,7 @@ void Terme_Source_inc_th_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab&
           ndeb = le_bord.num_premiere_face();
           nfin = ndeb + le_bord.nb_faces();
 
+          ToDo_Kokkos("critical");
           for (num_face=ndeb; num_face<nfin; num_face++)
             {
               vol = volumes_entrelaces(num_face)*porosite_surf(num_face);
@@ -1059,6 +1067,7 @@ void Terme_Source_inc_th_VDF_Face::ajouter_blocs(matrices_t matrices, DoubleTab&
   int num_e1=-1, n_comp1=-1;
   int num_e2=-1, n_comp2=-1;
   ndeb = domaine_VDF.premiere_face_int();
+  ToDo_Kokkos("critical");
   for (num_face =domaine_VDF.premiere_face_int(); num_face<domaine_VDF.nb_faces(); num_face++)
     {
 
diff --git a/src/VDF/Traitement_particulier/Traitement_particulier_NS_Profils_VDF.cpp b/src/VDF/Traitement_particulier/Traitement_particulier_NS_Profils_VDF.cpp
index 68d4866501..ec3eb1aaf3 100644
--- a/src/VDF/Traitement_particulier/Traitement_particulier_NS_Profils_VDF.cpp
+++ b/src/VDF/Traitement_particulier/Traitement_particulier_NS_Profils_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -326,6 +326,7 @@ void Traitement_particulier_NS_Profils_VDF::init_calcul_moyenne()
   for(i=0; i<n_probes; i++)
     {
       //Boucle sur les faces
+      ToDo_Kokkos("critical");
       for (num_face=0; num_face<nb_faces; num_face++)
         {
           ori = orientation(num_face);
@@ -401,6 +402,7 @@ void Traitement_particulier_NS_Profils_VDF::init_calcul_moyenne()
 
 
       //Boucle sur les elements
+      ToDo_Kokkos("critical");
       for (num_elem=0; num_elem<nb_elems; num_elem++)
         {
           if((xp(num_elem,dir_profil)-positions(i))>=0)
@@ -517,6 +519,7 @@ void Traitement_particulier_NS_Profils_VDF::init_calcul_moyenne()
       indicw_m = indicw_p = 0;
       indicuv_m = indicuv_p = 0;
       //Boucle sur les faces pour avoir la correspondance pour les composantes de la vitesse.
+      ToDo_Kokkos("critical");
       for (num_face=0; num_face<nb_faces; num_face++)
         {
           ori = orientation(num_face);
@@ -722,6 +725,7 @@ void Traitement_particulier_NS_Profils_VDF::init_calcul_moyenne()
 
 
       //Boucle sur les elements pour table de correspondance pour nu_t et temperature
+      ToDo_Kokkos("critical");
       for (num_elem=0; num_elem<nb_elems; num_elem++)
         {
           //Point avant
@@ -836,6 +840,7 @@ void Traitement_particulier_NS_Profils_VDF::calculer_moyenne_spatiale_nut(Double
   for(i=0; i<n_probes; i++)
     {
       // Calcul de nut
+      ToDo_Kokkos("critical");
       for (num_elem=0; num_elem<nb_elems; num_elem++)
         if(xp(num_elem,dir_profil)==xUV(i))
           {
@@ -899,6 +904,7 @@ void Traitement_particulier_NS_Profils_VDF::calculer_moyenne_spatiale_vitesse(Do
 
   for(i=0; i<n_probes; i++)
     {
+      ToDo_Kokkos("critical");
       for(num_face=0; num_face<nb_faces; num_face++)
         {
           ori_face = orientation[num_face];
@@ -1006,6 +1012,7 @@ void Traitement_particulier_NS_Profils_VDF::calculer_moyenne_spatiale_uv(DoubleT
   for(i=0; i<n_probes; i++)
     {
       // Calcul de uv
+      ToDo_Kokkos("critical");
       for (num_elem=0; num_elem<nb_elems; num_elem++)
         {
           if(xp(num_elem,dir_profil)==xUV(i))
diff --git a/src/VDF/Traitement_particulier/Traitement_particulier_NS_Profils_thermo_VDF.cpp b/src/VDF/Traitement_particulier/Traitement_particulier_NS_Profils_thermo_VDF.cpp
index 49603d5048..17c2d61daa 100644
--- a/src/VDF/Traitement_particulier/Traitement_particulier_NS_Profils_thermo_VDF.cpp
+++ b/src/VDF/Traitement_particulier/Traitement_particulier_NS_Profils_thermo_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -262,6 +262,7 @@ void Traitement_particulier_NS_Profils_thermo_VDF::calculer_moyennes_spatiales_t
   for(i=0; i<n_probes; i++)
     {
       // On parcourt tous les elements pour faire toutes les moyennes au centre des elements.
+      ToDo_Kokkos("critical");
       for (num_elem=0; num_elem<nb_elems; num_elem++)
         {
           // <T>
diff --git a/src/VDF/Traitement_particulier/Traitement_particulier_NS_canal_VDF.cpp b/src/VDF/Traitement_particulier/Traitement_particulier_NS_canal_VDF.cpp
index b8f845583d..d27c25fee4 100644
--- a/src/VDF/Traitement_particulier/Traitement_particulier_NS_canal_VDF.cpp
+++ b/src/VDF/Traitement_particulier/Traitement_particulier_NS_canal_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -73,6 +73,7 @@ void Traitement_particulier_NS_canal_VDF::remplir_Y(DoubleVect& tab_Y,  DoubleVe
   //Remplissage du tableau Y
   ////////////////////////////////////////////////////////
 
+  ToDo_Kokkos("critical");
   for (num_elem=0; num_elem<nb_elems; num_elem++)
     {
       y = xp(num_elem,1);
@@ -136,6 +137,7 @@ void Traitement_particulier_NS_canal_VDF::remplir_Tab_recap(IntTab& Tab_rec) con
 
   Tab_rec.resize(nb_elems,3); // On dimenssione le tableau.
 
+  ToDo_Kokkos("critical");
   for (num_elem=nb_elems; num_elem<nb_elem_tot; num_elem++) // boucle sur les elements fictifs
     {
       face = elem_faces(num_elem,1+dimension);
@@ -183,6 +185,7 @@ void Traitement_particulier_NS_canal_VDF::remplir_Tab_recap(IntTab& Tab_rec) con
     }
 
   Cerr << "Traitement particulier canal : Il y a une amelioration a apporter aux face de bord !! " << finl;
+  ToDo_Kokkos("critical");
   for (num_elem=0; num_elem<nb_elems; num_elem++)
     {
       q=0;// on utilise le compteur q qui ne nous sert plus pour verifier si on a trouver un equivalent.
@@ -249,6 +252,7 @@ void Traitement_particulier_NS_canal_VDF::calculer_moyenne_spatiale_vitesse_rho_
   int taille_mu=visco_dyn.dimension(0);
   int taille_rho=tab_rho_elem.dimension(0);
 
+  ToDo_Kokkos("critical");
   for (num_elem=0; num_elem<nb_elems; num_elem++)
     {
       //y=xp(num_elem,1);
@@ -316,6 +320,7 @@ void Traitement_particulier_NS_canal_VDF::calculer_moyenne_spatiale_nut(DoubleTa
   int num_elem,i;
   // double y;
 
+  ToDo_Kokkos("critical");
   for (num_elem=0; num_elem<nb_elems; num_elem++)
     {
       //y=xp(num_elem,1);
@@ -339,6 +344,7 @@ void Traitement_particulier_NS_canal_VDF::calculer_moyenne_spatiale_Temp(DoubleT
   int num_elem,i;
   int face_x_0,face_x_1,face_y_0,face_y_1,face_z_0,face_z_1;
 
+  ToDo_Kokkos("critical");
   for (num_elem=0; num_elem<nb_elems; num_elem++)
     {
       //y=xp(num_elem,1);
diff --git a/src/VDF/Traitement_particulier/Traitement_particulier_Solide_canal_VDF.cpp b/src/VDF/Traitement_particulier/Traitement_particulier_Solide_canal_VDF.cpp
index 1db1e870ac..983f70df2c 100644
--- a/src/VDF/Traitement_particulier/Traitement_particulier_Solide_canal_VDF.cpp
+++ b/src/VDF/Traitement_particulier/Traitement_particulier_Solide_canal_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -126,6 +126,7 @@ void Traitement_particulier_Solide_canal_VDF::calculer_moyennes_spatiales_thermo
   trms = 0.;
 
   // On parcourt tous les elements pour faire toutes les moyennes au centre des elements.
+  ToDo_Kokkos("critical");
   for (num_elem=0; num_elem<nb_elems; num_elem++)
     {
       // <T>
@@ -287,6 +288,7 @@ void Traitement_particulier_Solide_canal_VDF::init_calcul_moyenne()
   compt = 0;
   corresp = -1;
 
+  ToDo_Kokkos("critical");
   for (num_elem=0; num_elem<nb_elems; num_elem++)
     {
       y = xp(num_elem,1);
diff --git a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Smago_VDF.cpp b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Smago_VDF.cpp
index 118e45ea35..f4126ae2ac 100644
--- a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Smago_VDF.cpp
+++ b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Smago_VDF.cpp
@@ -56,6 +56,7 @@ Champ_Fonc_base& Modele_turbulence_hyd_LES_Smago_VDF::calculer_viscosite_turbule
 
   Debog::verifier("Modele_turbulence_hyd_LES_Smago_VDF::calculer_viscosite_turbulente visco_turb 0", visco_turb);
 
+  ToDo_Kokkos("critical");
   for (int elem = 0; elem < nb_elem; elem++)
     visco_turb[elem] = cs_ * cs_ * l_(elem) * l_(elem) * sqrt(SMA_barre_(elem));
 
@@ -83,6 +84,7 @@ void Modele_turbulence_hyd_LES_Smago_VDF::calculer_S_barre()
   vit.calcul_duidxj(vitesse, duidxj, domaine_Cl_VDF);
 
   double Sij, temp;
+  ToDo_Kokkos("critical");
   for (elem = 0; elem < nb_elem_tot; elem++)
     {
       temp = 0.;
diff --git a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_VDF_base.cpp b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_VDF_base.cpp
index 87685a3185..0eeafb4a71 100644
--- a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_VDF_base.cpp
+++ b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_VDF_base.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2024, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -33,47 +33,46 @@ Entree& Modele_turbulence_hyd_LES_VDF_base::readOn(Entree& is)
 void Modele_turbulence_hyd_LES_VDF_base::calculer_longueurs_caracteristiques()
 {
   const Domaine_VDF& domaine_VDF = ref_cast(Domaine_VDF, le_dom_VF_.valeur());
-  int nb_elem = domaine_VDF.domaine().nb_elem();
-  const IntTab& elem_faces = domaine_VDF.elem_faces();
-  const IntVect& orientation = domaine_VDF.orientation();
+  const int nb_elem = domaine_VDF.domaine().nb_elem();
+  const int dim = Objet_U::dimension;
 
-  l_.resize(nb_elem);
+  if (l_.size_array()!=nb_elem) l_.resize(nb_elem);
 
-  ArrOfDouble h(dimension);
-  double dist_tot, dist_min, dist_max, dist_moy;
-  double a1, a2, f_scotti;
-
-  const int modele_scotti = (methode_ == Motcle("Scotti"));
-  if (modele_scotti && (dimension == 2))
+  const bool modele_scotti = (methode_ == Motcle("Scotti"));
+  if (modele_scotti && (dim == 2))
     {
       Cerr << "The Scotti correction can be used only for dimension 3." << finl;
       exit();
     }
 
-  for (int elem = 0; elem < nb_elem; elem++)
-    {
-      for (int i = 0; i < dimension; i++)
-        h[i] = domaine_VDF.dim_elem(elem, orientation(elem_faces(elem, i)));
-
-      if (dimension == 2)
-        l_(elem) = exp((log(h[0] * h[1])) / 2);
-      else
-        l_(elem) = exp((log(h[0] * h[1] * h[2])) / 3);
-
-      if (modele_scotti)
-        {
-          dist_tot = h[0] + h[1] + h[2];
-
-          dist_min = min_array(h);
-          dist_max = max_array(h);
-          dist_moy = dist_tot - dist_min - dist_max;
-
-          a1 = dist_min / dist_max;
-          a2 = dist_moy / dist_max;
-
-          f_scotti = cosh(sqrt((4. / 27.) * (log(a1) * log(a1) - log(a1) * log(a2) + log(a2) * log(a2))));
-
-          l_(elem) *= f_scotti;
-        }
-    }
+  Domaine_VDF_View dom_vdf(domaine_VDF);
+  CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro();
+  CIntArrView orientation = domaine_VDF.orientation().view_ro();
+  DoubleArrView l = l_.view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem)
+  {
+    double h[3]; // element sizes per direction; h[2] stays unset in 2D, where it is never read
+    for (int i = 0; i < dim; i++)
+      h[i] = dom_vdf.dim_elem(elem, orientation(elem_faces(elem, i)));
+
+    if (dim == 2) // characteristic length = geometric mean of the element sizes
+      l(elem) = Kokkos::exp(Kokkos::log(h[0] * h[1]) / 2);
+    else
+      l(elem) = Kokkos::exp(Kokkos::log(h[0] * h[1] * h[2]) / 3);
+
+    if (modele_scotti) // 3D only: the 2D case exits before this kernel is launched
+      {
+        const double dist_min = Kokkos::fmin(h[0], Kokkos::fmin(h[1], h[2]));
+        const double dist_max = Kokkos::fmax(h[0], Kokkos::fmax(h[1], h[2]));
+        const double dist_moy = h[0] + h[1] + h[2] - dist_min - dist_max;
+
+        const double log_a1 = Kokkos::log(dist_min / dist_max); // each logarithm is evaluated once and reused below
+        const double log_a2 = Kokkos::log(dist_moy / dist_max);
+
+        const double f_scotti = Kokkos::cosh(Kokkos::sqrt((4. / 27.) * (log_a1 * log_a1 - log_a1 * log_a2 + log_a2 * log_a2)));
+
+        l(elem) *= f_scotti;
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
 }
diff --git a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.cpp b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.cpp
index adf94daa63..37fce45003 100644
--- a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.cpp
+++ b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.cpp
@@ -46,32 +46,37 @@ Champ_Fonc_base& Modele_turbulence_hyd_LES_Wale_VDF::calculer_viscosite_turbulen
 {
   const Domaine_VDF& domaine_VDF = ref_cast(Domaine_VDF, le_dom_VF_.valeur());
   const double temps = mon_equation_->inconnue().temps();
-  DoubleTab& visco_turb = la_viscosite_turbulente_->valeurs();
+  DoubleTab& tab_visco_turb = la_viscosite_turbulente_->valeurs();
 
   if (est_egal(cw_, 0., 1.e-15))
-    visco_turb = 0.;
+    tab_visco_turb = 0.;
   else
     {
-      const int nb_elem = domaine_VDF.domaine().nb_elem(), nb_elem_tot = domaine_VDF.nb_elem_tot();
-
-      OP1_.resize(nb_elem_tot);  // OP1 est le premier operateur spatial du modele WALE.
-      OP2_.resize(nb_elem_tot);  // OP2 est le deuxieme operateur spatial du modele WALE.
-
+      const int nb_elem_tot = domaine_VDF.nb_elem_tot();
+      if (OP1_.size_array()!=nb_elem_tot)
+        {
+          OP1_.resize(nb_elem_tot);  // OP1 est le premier operateur spatial du modele WALE.
+          OP2_.resize(nb_elem_tot);  // OP2 est le deuxieme operateur spatial du modele WALE.
+        }
       calculer_OP1_OP2();
 
-      if (visco_turb.size() != nb_elem)
+      const int nb_elem = domaine_VDF.domaine().nb_elem();
+      if (tab_visco_turb.size() != nb_elem)
         {
           Cerr << "Size error for the array containing the values of the turbulent viscosity." << finl;
           exit();
         }
 
-      for (int elem = 0; elem < nb_elem; elem++)
-        {
-          if (OP1_[elem] != 0.) // donc sd2 (et OP2 par voie de consequence) sont differents de zero
-            visco_turb[elem] = cw_ * cw_ * l_(elem) * l_(elem) * OP1_[elem] / OP2_[elem];
-          else
-            visco_turb[elem] = 0;
-        }
+      const double cw = cw_; // local copy used inside the lambda instead of the member cw_
+      CDoubleArrView l = l_.view_ro();
+      CDoubleArrView OP1 = OP1_.view_ro(); // OP1/OP2 are only read here: request read-only views
+      CDoubleArrView OP2 = OP2_.view_ro();
+      DoubleArrView visco_turb = static_cast<ArrOfDouble&>(tab_visco_turb).view_wo();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem)
+      { // the viscosity is set to zero where OP2 vanishes, which avoids dividing by zero
+        visco_turb[elem] = OP2[elem] == 0. ? 0. : cw * cw * l(elem) * l(elem) * OP1[elem] / OP2[elem];
+      });  // end of the loop over elements
+      end_gpu_timer(__KERNEL_NAME__);
     }
 
   la_viscosite_turbulente_->changer_temps(temps);
@@ -87,92 +92,72 @@ void Modele_turbulence_hyd_LES_Wale_VDF::calculer_OP1_OP2()
   const Domaine_Cl_VDF& domaine_Cl_VDF = ref_cast(Domaine_Cl_VDF, le_dom_Cl_.valeur());
   const int nb_elem = domaine_VDF.domaine().nb_elem_tot(), nb_elem_tot = domaine_VDF.nb_elem_tot();
 
-  const IntTab& face_voisins = domaine_VDF.face_voisins(), &elem_faces = domaine_VDF.elem_faces();
+  assert(vitesse.line_size() == 1);
+  DoubleTrav tab_duidxj(nb_elem_tot, dimension, dimension, vitesse.line_size()); // filled by calcul_duidxj on the next line
+  vit.calcul_duidxj(vitesse, tab_duidxj, domaine_Cl_VDF);
+
+  const int dim = Objet_U::dimension;
+  CDoubleTabView4 duidxj = tab_duidxj.view_ro<4>();
+  CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro();
+  CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+  DoubleArrView OP1 = OP1_.view_rw();
+  DoubleArrView OP2 = OP2_.view_rw();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_elem), KOKKOS_LAMBDA(const int elem)
+  { // nb_elem is initialised from nb_elem_tot() above, so the range covers nb_elem_tot elements
+    double gij2[3][3]; // per-element scratch; only the dim x dim part is used
+    double sd[3][3];
+
+    // Compute gij2(i,j) = sum over k of duidxj(i,k) * duidxj(k,j)
+    for (int i = 0; i < dim; i++)
+      for (int j = 0; j < dim; j++)
+        {
+          gij2[i][j] = 0;
+          for (int k = 0; k < dim; k++)
+            gij2[i][j] += duidxj(elem, i, k, 0) * duidxj(elem, k, j, 0);
+        }
+
+    // Compute gkk2, the trace of gij2
+    double gkk2 = 0;
+    for (int k = 0; k < dim; k++)
+      gkk2 += gij2[k][k];
 
-  DoubleTrav gij2(dimension, dimension), sd(dimension, dimension);
+    // Compute sd: symmetrised gij2, minus gkk2/3 on the diagonal
+    for (int i = 0; i < dim; i++)
+      for (int j = 0; j < dim; j++)
+        {
+          sd[i][j] = 0.5 * (gij2[i][j] + gij2[j][i]);
+          if (i == j)
+            sd[i][j] -= gkk2 / 3.; // Kronecker delta term
+        }
 
-  double gkk2, sd2, Sij, Sij2;
+    // Compute sd2 = sum of sd(i,j)^2 and Sij2 = sum of Sij^2
+    double sd2 = 0., Sij2 = 0.;
 
-  assert(vitesse.line_size() == 1);
-  DoubleTab duidxj(nb_elem_tot, dimension, dimension, vitesse.line_size());
+    for (int i = 0; i < dim; i++)
+      for (int j = 0; j < dim; j++)
+        {
+          sd2 += sd[i][j] * sd[i][j];
+          double Sij = 0.5 * (duidxj(elem, i, j, 0) + duidxj(elem, j, i, 0));
+
+          if (i == j)  // widen the Sii stencil (1 cell by construction) to 3 cells, closer to the ~2-cell stencil of the cross terms
+            {
+              const int face1 = elem_faces(elem, i);
+              const int face2 = elem_faces(elem, i + dim);
+              const int elem1 = face_voisins(face1, 0);
+              const int elem2 = face_voisins(face2, 1);
+              // if there is no boundary nearby, switch to the 3-cell stencil
+              // otherwise keep the 1-cell stencil
+              if (elem1 >= 0 && elem2 >= 0)
+                Sij = (duidxj(elem1, i, i, 0) + duidxj(elem, i, i, 0) + duidxj(elem2, i, i, 0)) / 3.;
+            }
+
+          Sij2 += Sij * Sij;
+        }
 
-  vit.calcul_duidxj(vitesse, duidxj, domaine_Cl_VDF);
+    // Compute OP1 = sd2^(3/2) and OP2 = Sij2^(5/2) + sd2^(5/4) (replace pow by sqrt and multiply, faster)
+    OP1(elem) = sd2 * Kokkos::sqrt(sd2);
+    OP2(elem) = Sij2 * Sij2 * Kokkos::sqrt(Sij2) + sd2 * Kokkos::sqrt(Kokkos::sqrt(sd2));
 
-  for (int elem = 0; elem < nb_elem; elem++)
-    {
-      //Calcul du terme gij2
-      for (int i = 0; i < dimension; i++)
-        for (int j = 0; j < dimension; j++)
-          {
-            gij2(i, j) = 0;
-
-            for (int k = 0; k < dimension; k++)
-              gij2(i, j) += duidxj(elem, i, k, 0) * duidxj(elem, k, j, 0);
-          }
-
-      // Calcul du terme gkk2
-      gkk2 = 0;
-      for (int k = 0; k < dimension; k++)
-        gkk2 += gij2(k, k);
-
-      // Calcul de sd
-      for (int i = 0; i < dimension; i++)
-        for (int j = 0; j < dimension; j++)
-          {
-            sd(i, j) = 0.5 * (gij2(i, j) + gij2(j, i));
-            if (i == j)
-              sd(i, j) -= gkk2 / 3.; // Terme derriere le tenseur de Kronecker
-          }
-
-      // Calcul de sd2 et Sij2
-      sd2 = 0.;
-      Sij2 = 0.;
-
-      int face1 = 0, face2 = 0;
-      int elem1, elem2;
-
-      for (int i = 0; i < dimension; i++)
-        for (int j = 0; j < dimension; j++)
-          {
-            sd2 += sd(i, j) * sd(i, j);
-            //Deplacement du calcul de sij
-            Sij = 0.5 * (duidxj(elem, i, j, 0) + duidxj(elem, j, i, 0));
-
-            // PQ : 24/01/07 : le stencil de Sij est par contruction de :
-            //                   -  1 maille pour les termes diagonaux Sii
-            //                   - ~2 mailles pour les termes croises Sij
-            //
-            // Wale s'appuyant a la fois sur sd2 (porte par Sij) et sur Sij2 (porte principalement par Sii)
-            // est sensible a cette difference de stencil.
-            // En portant le stencil a 3 maille spour le calcul de Sii, on retrouve en THI
-            // le bon taux de dissipation ainsi que des spectres possedant la bonne allure en k^-5/3.
-            //
-            // A traiter : Quid sur canal plan ???
-
-            if (i == j)  // augmentation du stencil de Sii
-              {
-                face1 = elem_faces(elem, i);
-                face2 = elem_faces(elem, i + dimension);
-
-                elem1 = face_voisins(face1, 0);
-                elem2 = face_voisins(face2, 1);
-
-                //if(elem1==elem) elem1=face_voisins(face1,1);  // par construction il n'y a pas besoin
-                //if(elem2==elem) elem2=face_voisins(face2,0);  // par construction il n'y a pas besoin
-
-                // si pas de bord a proximite on passe au stencil de 3 mailles
-                // sinon on reste au stencil a 1 maille
-
-                if (elem1 >= 0 && elem2 >= 0)
-                  Sij = ((duidxj(elem1, i, i, 0) + duidxj(elem, i, i, 0) + duidxj(elem2, i, i, 0))) / 3.;
-              }
-
-            Sij2 += Sij * Sij;
-          }
-
-      // Calcul de OP1 et OP2
-      OP1_(elem) = pow(sd2, 1.5);
-      OP2_(elem) = pow(Sij2, 2.5) + pow(sd2, 1.25);
-
-    }                // fin de la boucle sur les elements
+  });  // fin de la boucle sur les elements
+  end_gpu_timer(__KERNEL_NAME__);
 }
diff --git a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.h b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.h
index 8dd6db18ff..c6363d6573 100644
--- a/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.h
+++ b/src/VDF/Turbulence/Modele_turbulence_hyd_LES_Wale_VDF.h
@@ -33,12 +33,13 @@ class Modele_turbulence_hyd_LES_Wale_VDF: public Modele_turbulence_hyd_LES_VDF_b
   Modele_turbulence_hyd_LES_Wale_VDF();
   void set_param(Param& param) const override;
 
+  protected_but_public_for_cuda
+  Champ_Fonc_base& calculer_viscosite_turbulente() override;
+  void calculer_OP1_OP2();
+
 protected:
   double cw_ = 0.5;
   DoubleVect OP1_, OP2_;
-
-  Champ_Fonc_base& calculer_viscosite_turbulente() override;
-  void calculer_OP1_OP2();
 };
 
 #endif /* Modele_turbulence_hyd_LES_Wale_VDF_included */
diff --git a/src/VDF/Turbulence/Modele_turbulence_hyd_Longueur_Melange_VDF.cpp b/src/VDF/Turbulence/Modele_turbulence_hyd_Longueur_Melange_VDF.cpp
index 323fc57bf2..c2624fd53f 100644
--- a/src/VDF/Turbulence/Modele_turbulence_hyd_Longueur_Melange_VDF.cpp
+++ b/src/VDF/Turbulence/Modele_turbulence_hyd_Longueur_Melange_VDF.cpp
@@ -86,6 +86,7 @@ Champ_Fonc_base& Modele_turbulence_hyd_Longueur_Melange_VDF::calculer_viscosite_
 
   //    CANAL PLAN suivant (Ox - h=2) **********************************
 
+  ToDo_Kokkos("critical");
   for (int elem = 0; elem < nb_elem; elem++)
     {
       double y = xp(elem, direction_);
@@ -120,6 +121,7 @@ void Modele_turbulence_hyd_Longueur_Melange_VDF::calculer_Sij2()
 
   ch.calcul_duidxj(vitesse, duidxj, domaine_Cl_VDF);
 
+  ToDo_Kokkos("critical");
   for (int elem = 0; elem < nb_elem; elem++)
     {
       for (i = 0; i < dimension; i++)
diff --git a/src/VDF/Turbulence/Paroi_negligeable_VDF.cpp b/src/VDF/Turbulence/Paroi_negligeable_VDF.cpp
index 29d0bbef22..6bf922d8f8 100644
--- a/src/VDF/Turbulence/Paroi_negligeable_VDF.cpp
+++ b/src/VDF/Turbulence/Paroi_negligeable_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -38,153 +38,61 @@ int Paroi_negligeable_VDF::init_lois_paroi()
 
 int Paroi_negligeable_VDF::calculer_hyd(DoubleTab& tab_k_eps)
 {
-  const Equation_base& eqn_hydr = mon_modele_turb_hyd->equation();
-  if (sub_type(Fluide_base, eqn_hydr.milieu()))
-    {
-      int ndeb, nfin, elem, ori, l_unif;
-      double norm_tau, u_etoile, norm_v = 0, dist, val0, val1, val2, d_visco = 0, visco = 1.;
-
-      const Domaine_VDF& domaine_VDF = ref_cast(Domaine_VDF, le_dom_dis_.valeur());
-      const IntTab& face_voisins = domaine_VDF.face_voisins();
-      const IntVect& orientation = domaine_VDF.orientation();
-      const Fluide_base& le_fluide = ref_cast(Fluide_base, eqn_hydr.milieu());
-      const Champ_Don_base& ch_visco_cin = le_fluide.viscosite_cinematique();
-      const DoubleTab& tab_visco = ch_visco_cin.valeurs();
-      const DoubleTab& vit = eqn_hydr.inconnue().valeurs();
-
-      if (sub_type(Champ_Uniforme, ch_visco_cin))
-        {
-          visco = tab_visco(0, 0);
-          l_unif = 1;
-        }
-      else
-        l_unif = 0;
-
-      for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++)
-        {
-          const Cond_lim& la_cl = le_dom_Cl_dis_->les_conditions_limites(n_bord);
-
-          if (sub_type(Dirichlet_paroi_fixe, la_cl.valeur()))
-            {
-              const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-
-              for (int num_face = ndeb; num_face < nfin; num_face++)
-                {
-
-                  if (face_voisins(num_face, 0) != -1)
-                    elem = face_voisins(num_face, 0);
-                  else
-                    elem = face_voisins(num_face, 1);
-
-                  if (dimension == 2)
-                    {
-                      ori = orientation(num_face);
-                      norm_v = norm_2D_vit(vit, elem, ori, domaine_VDF, val0);
-                    }
-                  else if (dimension == 3)
-                    {
-                      ori = orientation(num_face);
-                      norm_v = norm_3D_vit(vit, elem, ori, domaine_VDF, val1, val2);
-                    }
-
-                  if (axi)
-                    dist = domaine_VDF.dist_norm_bord_axi(num_face);
-                  else
-                    dist = domaine_VDF.dist_norm_bord(num_face);
-                  if (l_unif)
-                    d_visco = visco;
-                  else
-                    d_visco = tab_visco[elem];
-
-                  norm_tau = d_visco * norm_v / dist;
-                  u_etoile = sqrt(norm_tau);
-                  tab_u_star_(num_face) = u_etoile;
-
-                } // loop on faces
-
-            } // Fin paroi fixe
-
-        } // Fin boucle sur les bords
-
-    }
-  return 1;
+  return calculer_hyd(tab_k_eps, tab_k_eps);  // arguments are not used anyway
 }
 
 int Paroi_negligeable_VDF::calculer_hyd(DoubleTab& tab_nu_t, DoubleTab& tab_k)
 {
   const Equation_base& eqn_hydr = mon_modele_turb_hyd->equation();
-  if (sub_type(Fluide_base, eqn_hydr.milieu()))
+  if (!sub_type(Fluide_base, eqn_hydr.milieu())) return 1; // nothing is computed when the medium is not a Fluide_base
+
+  const Domaine_VDF& domaine_VDF = ref_cast(Domaine_VDF, le_dom_dis_.valeur());
+  const Domaine_Cl_VDF& dom_Cl_VDF = ref_cast(Domaine_Cl_VDF, le_dom_Cl_dis_.valeur());
+  const Champ_Don_base& ch_visco_cin = ref_cast(Fluide_base, eqn_hydr.milieu()).viscosite_cinematique();
+  const DoubleTab& tab_visco_cin = ch_visco_cin.valeurs();
+  const int l_unif = sub_type(Champ_Uniforme, ch_visco_cin) ? 1 : 0; // 1 when the kinematic viscosity is a Champ_Uniforme
+  const double visco = l_unif ? tab_visco_cin(0, 0) : 1.; // 1. is a placeholder, not read when l_unif is 0
+
+  const int dim = Objet_U::dimension; // local copies of the static members, used inside the kernel
+  const int is_axi = Objet_U::axi;
+  Domaine_VDF_View dom_vdf(domaine_VDF);
+  CIntTabView face_voisins = domaine_VDF.face_voisins().view_ro();
+  CIntArrView orientation = domaine_VDF.orientation().view_ro();
+  CIntTabView elem_faces = domaine_VDF.elem_faces().view_ro();
+  CDoubleArrView vitesse = static_cast<const ArrOfDouble&>(eqn_hydr.inconnue().valeurs()).view_ro();
+  CDoubleTabView tab_visco; // left default-constructed when the viscosity is uniform
+  if (!l_unif) tab_visco = tab_visco_cin.view_ro();
+  DoubleArrView u_star = tab_u_star_.view_wo(); // only the faces of Dirichlet_paroi_fixe boundaries are written below
+  for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++)
     {
-      int ndeb, nfin, elem, ori, l_unif;
-      double norm_tau, u_etoile, norm_v = 0, dist, val0, val1, val2, d_visco = 0, visco = 1.;
-
-      const Domaine_VDF& domaine_VDF = ref_cast(Domaine_VDF, le_dom_dis_.valeur());
-      const IntTab& face_voisins = domaine_VDF.face_voisins();
-      const IntVect& orientation = domaine_VDF.orientation();
-      const Fluide_base& le_fluide = ref_cast(Fluide_base, eqn_hydr.milieu());
-      const Champ_Don_base& ch_visco_cin = le_fluide.viscosite_cinematique();
-      const DoubleTab& tab_visco = ch_visco_cin.valeurs();
-      const DoubleTab& vit = eqn_hydr.inconnue().valeurs();
-
-      if (sub_type(Champ_Uniforme, ch_visco_cin))
+      const Cond_lim& la_cl = dom_Cl_VDF.les_conditions_limites(n_bord);
+      if (sub_type(Dirichlet_paroi_fixe, la_cl.valeur()))
         {
-          visco = tab_visco(0, 0);
-          l_unif = 1;
-        }
-      else
-        l_unif = 0;
-
-      for (int n_bord = 0; n_bord < domaine_VDF.nb_front_Cl(); n_bord++)
-        {
-          const Cond_lim& la_cl = le_dom_Cl_dis_->les_conditions_limites(n_bord);
-
-          if (sub_type(Dirichlet_paroi_fixe, la_cl.valeur()))
-            {
-              const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-              ndeb = le_bord.num_premiere_face();
-              nfin = ndeb + le_bord.nb_faces();
-
-              for (int num_face = ndeb; num_face < nfin; num_face++)
-                {
-
-                  if (face_voisins(num_face, 0) != -1)
-                    elem = face_voisins(num_face, 0);
-                  else
-                    elem = face_voisins(num_face, 1);
-
-                  if (dimension == 2)
-                    {
-                      ori = orientation(num_face);
-                      norm_v = norm_2D_vit(vit, elem, ori, domaine_VDF, val0);
-                    }
-                  else if (dimension == 3)
-                    {
-                      ori = orientation(num_face);
-                      norm_v = norm_3D_vit(vit, elem, ori, domaine_VDF, val1, val2);
-                    }
-
-                  if (axi)
-                    dist = domaine_VDF.dist_norm_bord_axi(num_face);
-                  else
-                    dist = domaine_VDF.dist_norm_bord(num_face);
-                  if (l_unif)
-                    d_visco = visco;
-                  else
-                    d_visco = tab_visco[elem];
-
-                  norm_tau = d_visco * norm_v / dist;
-                  u_etoile = sqrt(norm_tau);
-                  tab_u_star_(num_face) = u_etoile;
-
-                } // loop on faces
-
-            } // Fin paroi fixe
-
-        } // Fin boucle sur les bords
-
-    }
+          const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
+          const int ndeb = le_bord.num_premiere_face(); // faces [ndeb, nfin) belong to this boundary
+          const int nfin = ndeb + le_bord.nb_faces();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(ndeb, nfin), KOKKOS_LAMBDA(const int num_face)
+          {
+            const int elem = face_voisins(num_face, 0) != -1 ? face_voisins(num_face, 0) : face_voisins(num_face, 1); // neighbour on side 0 unless it is -1, else side 1
+            const int ori = orientation(num_face);
+            double norm_v = 0;
+            if (dim == 2)
+              {
+                double val0; // output argument of norm_2D_vit, not used afterwards
+                norm_v = norm_2D_vit(vitesse, elem, ori, elem_faces, val0);
+              }
+            else
+              {
+                double val1, val2; // output arguments of norm_3D_vit, not used afterwards
+                norm_v = norm_3D_vit(vitesse, elem, ori, elem_faces, val1, val2);
+              }
+            const double dist = is_axi ? dom_vdf.dist_norm_bord_axi(num_face) : dom_vdf.dist_norm_bord(num_face);
+            const double d_visco = l_unif ? visco : tab_visco(elem, 0);
+            u_star(num_face) = Kokkos::sqrt(d_visco * norm_v / dist); // u_star = sqrt(viscosity * velocity norm / wall distance)
+          }); // loop on faces
+          end_gpu_timer(__KERNEL_NAME__);
+        } // end of the fixed-wall (Dirichlet_paroi_fixe) case
+    } // end of the loop over boundaries
   return 1;
 }
 
diff --git a/src/VDF/Turbulence/Paroi_scal_hyd_base_VDF.cpp b/src/VDF/Turbulence/Paroi_scal_hyd_base_VDF.cpp
index aaa6fa549b..1558eb9cf8 100644
--- a/src/VDF/Turbulence/Paroi_scal_hyd_base_VDF.cpp
+++ b/src/VDF/Turbulence/Paroi_scal_hyd_base_VDF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -102,6 +102,7 @@ void Paroi_scal_hyd_base_VDF::compute_nusselt() const
 
           ndeb = le_bord.num_premiere_face();
           nfin = ndeb + le_bord.nb_faces();
+          ToDo_Kokkos("critical");
           for (int num_face = ndeb; num_face < nfin; num_face++)
             {
               double dist, lambda;
@@ -200,6 +201,7 @@ void Paroi_scal_hyd_base_VDF::imprimer_nusselt(Sortie& os) const
                 }
               ndeb = le_bord.num_premiere_face();
               nfin = ndeb + le_bord.nb_faces();
+              ToDo_Kokkos("critical");
               for (int num_face = ndeb; num_face < nfin; num_face++)
                 {
                   double x = domaine_VDF.xv(num_face, 0);
@@ -251,6 +253,7 @@ void Paroi_scal_hyd_base_VDF::imprimer_nusselt(Sortie& os) const
 
               ndeb = le_bord.num_premiere_face();
               nfin = ndeb + le_bord.nb_faces();
+              ToDo_Kokkos("critical");
               for (int num_face = ndeb; num_face < nfin; num_face++)
                 {
                   double x = domaine_VDF.xv(num_face, 0);
diff --git a/src/VEF/Champs/Champ_Fonc_Tabule_P0_VEF.cpp b/src/VEF/Champs/Champ_Fonc_Tabule_P0_VEF.cpp
index 29143d6603..fc90211b54 100644
--- a/src/VEF/Champs/Champ_Fonc_Tabule_P0_VEF.cpp
+++ b/src/VEF/Champs/Champ_Fonc_Tabule_P0_VEF.cpp
@@ -33,78 +33,7 @@ void Champ_Fonc_Tabule_P0_VEF::associer_param(const VECT(OBS_PTR(Champ_base)) &l
 
 void Champ_Fonc_Tabule_P0_VEF::mettre_a_jour(double t)
 {
-  const Domaine_VF& domaine_VF = le_dom_VF.valeur();
-  const Table& table = la_table.valeur();
-  DoubleTab& mes_valeurs = valeurs();
-  const int nb_elem = domaine_VF.nb_elem(), nb_elem_tot = domaine_VF.nb_elem_tot(), nb_param = les_ch_param.size();
-  const int nbcomp = mes_valeurs.dimension(1);
-  const DoubleTab& centres_de_gravites = domaine_VF.xp();
-
-  // ToDo Kokkos: factorize somewhere this array or rewrite valeur_aux_elems !
-  IntTrav les_polys(nb_elem_tot);
-  IntArrView les_polys_v = static_cast<IntVect&>(les_polys).view_wo();
-  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem_tot, KOKKOS_LAMBDA(const int num_elem)
-  {
-    les_polys_v(num_elem) = num_elem;
-  });
-  end_gpu_timer(__KERNEL_NAME__);
-
-  if (nb_param==1 && nbcomp==1 && table.isfonction()==1)
-    {
-      // Ported on GPU. ToDo Kokkos, extend to more than one param or more than one nbcomp
-      DoubleTrav val_param_aux_elems(nb_elem_tot, nbcomp);
-      les_ch_param[0]->valeur_aux_elems(centres_de_gravites, les_polys, val_param_aux_elems);
-      // Cree un parser specifique ParserView pour Kokkos:
-      ParserView parser(table.parser(0));
-      parser.parseString();
-      CDoubleTabView val_params_aux_elems_v = val_param_aux_elems.view_ro();
-      DoubleTabView mes_valeurs_v = mes_valeurs.view_wo();
-      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(
-                             const int num_elem)
-      {
-        int threadId = parser.acquire();
-        for (int ncomp = 0; ncomp < nbcomp; ncomp++)
-          {
-            double val = val_params_aux_elems_v(num_elem, ncomp);
-
-            parser.setVar(0, val, threadId);
-            mes_valeurs_v(num_elem, ncomp) = parser.eval(threadId);
-          }
-        parser.release(threadId);
-      });
-      end_gpu_timer(__KERNEL_NAME__);
-    }
-  else
-    {
-      ToDo_Kokkos("critical");
-      DoubleTabs val_params_aux_elems;
-      for (int i = 0; i < nb_param; i++)
-        {
-          DoubleTab vp(nb_elem_tot, mes_valeurs.dimension(1));
-          val_params_aux_elems.add(vp);
-        }
-      for (int i = 0; i < nb_param; i++)
-        les_ch_param[i]->valeur_aux_elems(centres_de_gravites, les_polys, val_params_aux_elems[i]);
-
-      if (table.isfonction() != 2)
-        {
-          std::vector<double> vals;
-          vals.reserve(nb_param); // Pre-allocate space once
-          for (int num_elem = 0; num_elem < nb_elem; num_elem++)
-            for (int ncomp = 0; ncomp < nbcomp; ncomp++)
-              {
-                vals.clear();
-                for (int n = 0; n < nb_param; n++)
-                  vals.push_back(val_params_aux_elems[n](num_elem, ncomp));
-                mes_valeurs(num_elem, ncomp) = table.val(vals, ncomp);
-              }
-        }
-      else
-        {
-          table.valeurs(val_params_aux_elems[0], centres_de_gravites, t, mes_valeurs);
-        }
-    }
-  Champ_Fonc_base::mettre_a_jour(t);
+  Champ_Fonc_P0_base::mettre_a_jour(t, la_table.valeur(), les_ch_param);
 }
 
 int Champ_Fonc_Tabule_P0_VEF::initialiser(const double un_temps)
diff --git a/src/VEF/Champs/Champ_P1NC.cpp b/src/VEF/Champs/Champ_P1NC.cpp
index b6c6b58345..59caa2c95b 100644
--- a/src/VEF/Champs/Champ_P1NC.cpp
+++ b/src/VEF/Champs/Champ_P1NC.cpp
@@ -565,7 +565,7 @@ void Champ_P1NC::calcul_y_plus(const Domaine_Cl_VEF& domaine_Cl_VEF, DoubleVect&
     }
   // tab_visco+=DMINFLOAT;
 
-  DoubleTab yplus_faces(1, 1); // will contain yplus values if available
+  DoubleTrav yplus_faces(1, 1); // will contain yplus values if available
   int yplus_already_computed = 0; // flag
 
   const RefObjU& modele_turbulence = eqn_hydr.get_modele(TURBULENCE);
@@ -986,60 +986,97 @@ DoubleTab& Champ_P1NC::calcul_duidxj_paroi(DoubleTab& tab_gij, const DoubleTab&
           CDoubleTabView tau_tan = tab_tau_tan.view_ro();
           CDoubleArrView nu = static_cast<const ArrOfDouble&>(tab_nu).view_ro();
           CDoubleArrView nu_turb = static_cast<const ArrOfDouble&>(tab_nu_turb).view_ro();
-          DoubleTabView3 gij = tab_gij.view_rw<3>();
-          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin), KOKKOS_LAMBDA (const int fac)
+          // Two-pass approach: compute all C values from the unmodified gij (pass 1),
+          // then apply corrections atomically (pass 2). This avoids a race condition
+          // when multiple faces share the same adjacent element (corner elements), and
+          // gives the same result as serial execution when num1 is unique per boundary.
+          DoubleTrav tab_C(nfin - ndeb);
+          DoubleArrView C = static_cast<ArrOfDouble&>(tab_C).view_rw();
+
+          // Pass 1: read gij (read-only), compute and store C per face
           {
-            double P[3][3];
-            int num1 = face_voisins(fac, 0);
-            // definition des vecteurs unitaires constituant le repere local
-            // stockes dans la matrice de passage P
-            // vecteur tangentiel (porte par la vitesse tangentielle)
-            double sum = 0.;
-            for (int i = 0; i < dim; i++)
-              sum += tau_tan(fac, i) * tau_tan(fac, i);
-            double norme_tau_tan = sqrt(sum);
-            for (int i = 0; i < dim; i++)
-              P[i][0] = tau_tan(fac, i) / (norme_tau_tan + DMINFLOAT);
-
-            // vecteur normal a la paroi
-            sum = 0.;
-            for (int i = 0; i < dim; i++)
-              sum += face_normale(fac, i) * face_normale(fac, i);
-            double norme = sqrt(sum);
-
-            int signe = -oriente_normale(fac, num1, face_voisins); // orientation vers l'interieur
-            for (int i = 0; i < dim; i++)
-              P[i][1] = signe * face_normale(fac, i) / norme;
-
-            // (3D) on complete la base par le deuxieme vecteur tangentiel
-            if (dim == 3)
-              {
-                P[0][2] = P[1][0] * P[2][1] - P[2][0] * P[1][1];
-                P[1][2] = P[2][0] * P[0][1] - P[0][0] * P[2][1];
-                P[2][2] = P[0][0] * P[1][1] - P[1][0] * P[0][1];
-              }
-            //         determination du terme d(u_t)/dn a enlever
-            //                                                       -1
-            //         terme identifie a l'aide du produit : F =  P . G . P
-            //
-            double dutdn_old = 0.;
-            for (int i = 0; i < dim; i++)
-              for (int j = 0; j < dim; j++)
+            CDoubleTabView3 gij = tab_gij.view_ro<3>();
+            Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin), KOKKOS_LAMBDA (const int fac)
+            {
+              double P[3][3];
+              int num1 = face_voisins(fac, 0);
+              double sum = 0.;
+              for (int i = 0; i < dim; i++)
+                sum += tau_tan(fac, i) * tau_tan(fac, i);
+              double norme_tau_tan = sqrt(sum);
+              for (int i = 0; i < dim; i++)
+                P[i][0] = tau_tan(fac, i) / (norme_tau_tan + DMINFLOAT);
+
+              sum = 0.;
+              for (int i = 0; i < dim; i++)
+                sum += face_normale(fac, i) * face_normale(fac, i);
+              double norme = sqrt(sum);
+
+              int signe = -oriente_normale(fac, num1, face_voisins); // orientation vers l'interieur
+              for (int i = 0; i < dim; i++)
+                P[i][1] = signe * face_normale(fac, i) / norme;
+
+              if (dim == 3)
                 {
-                  double gij_value = Kokkos::atomic_fetch_add(&gij(num1, i, j), 0.0);
-                  dutdn_old += gij_value * P[j][1] * P[i][0];
+                  P[0][2] = P[1][0] * P[2][1] - P[2][0] * P[1][1];
+                  P[1][2] = P[2][0] * P[0][1] - P[0][0] * P[2][1];
+                  P[2][2] = P[0][0] * P[1][1] - P[1][0] * P[0][1];
                 }
 
-            //         Correction finale apportee a la matrice G
-            double C = -dutdn_old + norme_tau_tan / (nu[num1] + nu_turb[num1]) * porosite_face(fac);
+              //         Determine the d(u_t)/dn term to be removed.
+              //         It is identified using the product :
+              //         F = P^-1 . G . P
+              //
+              double dutdn_old = 0.;
+              for (int i = 0; i < dim; i++)
+                for (int j = 0; j < dim; j++)
+                  dutdn_old += gij(num1, i, j) * P[j][1] * P[i][0];
+
+              //         Final correction applied to the matrix G
+              // the division by (nu[num1]+nu_turb[num1]) is required because the diffusion operator
+              // involves the product : (nu[num1]+nu_turb[num1])*g(i,j)
+              C(fac - ndeb) = -dutdn_old + norme_tau_tan / (nu[num1] + nu_turb[num1]) * porosite_face(fac);
+            });
+            end_gpu_timer(__KERNEL_NAME__);
+          }
 
-            // la division par (nu[num1]+nu_turb[num1]) s'impose du fait que l'operateur de diffusion
-            // fait intervenir le produit : (nu[num1]+nu_turb[num1])*g(i,j)
-            for (int i = 0; i < dim; i++)
-              for (int j = 0; j < dim; j++)
-                Kokkos::atomic_add(&gij(num1, i, j), C * P[j][1] * P[i][0]);
-          });
-          end_gpu_timer(__KERNEL_NAME__);
+          // Pass 2: apply corrections to gij atomically
+          {
+            DoubleTabView3 gij = tab_gij.view_rw<3>();
+            Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin), KOKKOS_LAMBDA (const int fac)
+            {
+              double P[3][3];
+              int num1 = face_voisins(fac, 0);
+              double sum = 0.;
+              for (int i = 0; i < dim; i++)
+                sum += tau_tan(fac, i) * tau_tan(fac, i);
+              double norme_tau_tan = sqrt(sum);
+              for (int i = 0; i < dim; i++)
+                P[i][0] = tau_tan(fac, i) / (norme_tau_tan + DMINFLOAT);
+
+              sum = 0.;
+              for (int i = 0; i < dim; i++)
+                sum += face_normale(fac, i) * face_normale(fac, i);
+              double norme = sqrt(sum);
+
+              int signe = -oriente_normale(fac, num1, face_voisins); // orientation vers l'interieur
+              for (int i = 0; i < dim; i++)
+                P[i][1] = signe * face_normale(fac, i) / norme;
+
+              if (dim == 3)
+                {
+                  P[0][2] = P[1][0] * P[2][1] - P[2][0] * P[1][1];
+                  P[1][2] = P[2][0] * P[0][1] - P[0][0] * P[2][1];
+                  P[2][2] = P[0][0] * P[1][1] - P[1][0] * P[0][1];
+                }
+
+              const double coeff = C(fac - ndeb);
+              for (int i = 0; i < dim; i++)
+                for (int j = 0; j < dim; j++)
+                  Kokkos::atomic_add(&gij(num1, i, j), coeff * P[j][1] * P[i][0]);
+            });
+            end_gpu_timer(__KERNEL_NAME__);
+          }
         }
     }
 
diff --git a/src/VEF/Champs/Champ_P1NC_implementation.cpp b/src/VEF/Champs/Champ_P1NC_implementation.cpp
index 4203a4a587..d23658c66b 100644
--- a/src/VEF/Champs/Champ_P1NC_implementation.cpp
+++ b/src/VEF/Champs/Champ_P1NC_implementation.cpp
@@ -1794,7 +1794,7 @@ double Champ_P1NC_implementation::valeur_a_sommet_compo(int num_som, int num_ele
 }
 
 KOKKOS_INLINE_FUNCTION
-double Champ_P1NC_implementation::valeur_a_sommet_compo(int num_som, int num_elem, int ncomp, CIntTabView elem_faces, CIntTabView sommet_elem, CDoubleTabView ch) const
+double Champ_P1NC_implementation::valeur_a_sommet_compo(int num_som, int num_elem, int ncomp, CIntTabView elem_faces, CIntTabView sommet_elem, CDoubleTabView ch)
 {
   double val=0;
   if (num_elem != -1)
diff --git a/src/VEF/Champs/Champ_P1NC_implementation.h b/src/VEF/Champs/Champ_P1NC_implementation.h
index 9c9b7edfa7..ed630d4e10 100644
--- a/src/VEF/Champs/Champ_P1NC_implementation.h
+++ b/src/VEF/Champs/Champ_P1NC_implementation.h
@@ -36,12 +36,12 @@ class Champ_P1NC_implementation: public Champ_implementation_divers
 
   int fixer_nb_valeurs_nodales(int);
 
-  KOKKOS_INLINE_FUNCTION double fonction_forme_2D_v(double x, double y, int le_poly, int face, CIntTabView sommet_poly, CDoubleTabView coord) const
+  KOKKOS_INLINE_FUNCTION static double fonction_forme_2D_v(double x, double y, int le_poly, int face, CIntTabView sommet_poly, CDoubleTabView coord)
   {
     return 1 - 2 * coord_barycentrique_P1_triangle(sommet_poly, coord, x, y, le_poly, face);
   }
 
-  KOKKOS_INLINE_FUNCTION double fonction_forme_3D_v(double x, double y, double z, int le_poly, int face, CIntTabView sommet_poly, CDoubleTabView coord) const
+  KOKKOS_INLINE_FUNCTION static double fonction_forme_3D_v(double x, double y, double z, int le_poly, int face, CIntTabView sommet_poly, CDoubleTabView coord)
   {
     return 1 - 3 * coord_barycentrique_P1_tetraedre(sommet_poly, coord, x, y, z, le_poly, face);
   }
@@ -82,8 +82,8 @@ class Champ_P1NC_implementation: public Champ_implementation_divers
   DoubleVect& valeur_aux_sommets_compo(const Domaine& dom, DoubleVect& ch_som, int ncomp) const override;
   // Retourne la valeur de la composante ncomp du champ au sommet num_som sur l'element le_poly
   double valeur_a_sommet_compo(int num_som, int le_poly, int ncomp) const;
-  KOKKOS_INLINE_FUNCTION
-  double valeur_a_sommet_compo(int num_som, int num_elem, int ncomp, CIntTabView elem_faces, CIntTabView sommet_elem, CDoubleTabView ch) const;
+  KOKKOS_INLINE_FUNCTION static
+  double valeur_a_sommet_compo(int num_som, int num_elem, int ncomp, CIntTabView elem_faces, CIntTabView sommet_elem, CDoubleTabView ch);
 
   DoubleTab& valeur_aux_elems_smooth(const DoubleTab& positions, const IntVect& les_polys, DoubleTab& valeurs);
   DoubleVect& valeur_aux_elems_compo_smooth(const DoubleTab& positions, const IntVect& les_polys, DoubleVect& valeurs, int ncomp);
diff --git a/src/VEF/Champs/Champ_P1iP1B_implementation.cpp b/src/VEF/Champs/Champ_P1iP1B_implementation.cpp
index d0fd8708bd..c2c563d661 100644
--- a/src/VEF/Champs/Champ_P1iP1B_implementation.cpp
+++ b/src/VEF/Champs/Champ_P1iP1B_implementation.cpp
@@ -276,7 +276,7 @@ void assembler(const Domaine_VEF& domaine_VEF, Matrice& matrice)
   Matrice_Morse_Sym& MatPoisson=ref_cast(Matrice_Morse_Sym, matrice.valeur());
   int nb_som_tot = domaine_VEF.domaine().nb_som_tot();
   int nb_arete_tot = domaine_VEF.domaine().nb_aretes_tot();
-  int nnz=nb_som_tot+nb_arete_tot;
+  int nnz=0;
   const IntTab& aretes_som=domaine_VEF.domaine().aretes_som();
   const ArrOfInt& renum_arete_perio=domaine_VEF.get_renum_arete_perio();
   const Domaine& dom=domaine_VEF.domaine();
@@ -299,6 +299,9 @@ void assembler(const Domaine_VEF& domaine_VEF, Matrice& matrice)
             }
           voisins[som1].add(som2);
           coeffs[som1].add(1);
+          nnz++;
+          if (diag(som1)==0) nnz++;
+          if (diag(som2)==0) nnz++;
           diag(som1)++;
           diag(som2)++;
         }
@@ -309,6 +312,7 @@ void assembler(const Domaine_VEF& domaine_VEF, Matrice& matrice)
         {
           assert(i!=dom.get_renum_som_perio(i));
           diag(i)=1; // Sommets periodiques
+          nnz++;
         }
     }
   MatPoisson.dimensionner(nb_som_tot, nnz) ;
diff --git a/src/VEF/Geometrie/Domaine_Cl_VEF.cpp b/src/VEF/Geometrie/Domaine_Cl_VEF.cpp
index dafe173413..c39beaf01b 100644
--- a/src/VEF/Geometrie/Domaine_Cl_VEF.cpp
+++ b/src/VEF/Geometrie/Domaine_Cl_VEF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -147,7 +147,6 @@ void Domaine_Cl_VEF::remplir_volumes_entrelaces_Cl(const Domaine_VEF& le_dom_VEF
 
       int nb_poly_tot = le_dom_VEF.domaine().nb_elem_tot();
       ArrOfInt poly_fait(nb_poly_tot);
-      poly_fait = 0;
       for (int i = 0; i < les_conditions_limites_.size(); i++)
         {
           const Cond_lim_base& la_cl = les_conditions_limites_[i].valeur();
@@ -425,7 +424,7 @@ void Domaine_Cl_VEF::imposer_cond_lim(Champ_Inc_base& ch, double temps)
                         surf += face_n * face_n;
                       }
                     // flux /= surf; // Fixed bug: Arithmetic exception
-                    if (std::fabs(surf) >= DMINFLOAT)
+                    if (Kokkos::fabs(surf) >= DMINFLOAT)
                       flux /= surf;
                     for (int ncomp = 0; ncomp < nb_comp; ncomp++)
                       tab(num_face, ncomp) -= flux * face_normales(num_face, ncomp);
@@ -551,7 +550,7 @@ void Domaine_Cl_VEF::imposer_cond_lim(Champ_Inc_base& ch, double temps)
                         surf += face_n * face_n;
                       }
                     // flux /= surf; // Fixed bug: Arithmetic exception
-                    if (std::fabs(surf) >= DMINFLOAT)
+                    if (Kokkos::fabs(surf) >= DMINFLOAT)
                       flux /= surf;
                     for (int ncomp = 0; ncomp < nb_comp; ncomp++)
                       tab(num_face, ncomp) = val_imp(num_face - ndeb, ncomp) -
@@ -621,7 +620,7 @@ void Domaine_Cl_VEF::imposer_cond_lim(Champ_Inc_base& ch, double temps)
                   int num2 = le_bord.nb_faces_tot();
                   CIntArrView num_face = le_bord.num_face().view_ro();
                   CIntTabView faces = domaine_vef.face_sommets().view_ro();
-                  CDoubleArrView flux_impose = static_cast<const DoubleVect&>(la_sortie_libre.flux_impose(true)).view_ro();
+                  CDoubleArrView flux_impose = static_cast<const DoubleVect&>(la_sortie_libre.tab_flux_impose(true)).view_ro();
                   CDoubleArrView face_surfaces = domaine_vef.face_surfaces().view_ro();
                   DoubleArrView surf_loc = static_cast<DoubleVect&>(tab_surf_loc).view_rw();
                   DoubleArrView pression = static_cast<DoubleVect&>(tab_pression).view_rw();
diff --git a/src/VEF/Geometrie/VerifierCoin.cpp b/src/VEF/Geometrie/VerifierCoin.cpp
index 1353f12620..e297f2c132 100644
--- a/src/VEF/Geometrie/VerifierCoin.cpp
+++ b/src/VEF/Geometrie/VerifierCoin.cpp
@@ -131,7 +131,6 @@ Entree& VerifierCoin::interpreter_(Entree& is)
 
   // On compte les elements attaches a chaque sommet:
   ArrOfInt nb_elem_per_som(nbsom);
-  nb_elem_per_som = 0;
   for (int ne = 0; ne < nbelem; ne++)
     for (int ns = 0; ns < dimension+1; ns++)
       nb_elem_per_som(les_elems(ne,ns))++;
diff --git a/src/VEF/Geometrie/distances_VEF.cpp b/src/VEF/Geometrie/distances_VEF.cpp
index 3faf6f311b..846e356413 100644
--- a/src/VEF/Geometrie/distances_VEF.cpp
+++ b/src/VEF/Geometrie/distances_VEF.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -13,6 +13,7 @@
 *
 *****************************************************************************/
 #include <distances_VEF.h>
+#include <TRUSTTrav.h>
 #include <Domaine.h>
 #include <Motcle.h>
 
@@ -493,14 +494,13 @@ double distance_3D(int fac,int elem,const Domaine_VEF& domaine)
 
 
 
-DoubleVect& calcul_longueur_filtre(DoubleVect& longueur_filtre, const Motcle& methode, const Domaine_VEF& domaine)
+DoubleVect& calcul_longueur_filtre(DoubleVect& tab_longueur_filtre, const Motcle& methode, const Domaine_VEF& domaine)
 {
-  int nbr_element=domaine.nb_elem_tot();
-  int element;
-  int dim=Objet_U::dimension;
+  const int nbr_element = domaine.nb_elem_tot();
+  const int dim = Objet_U::dimension;
   const Domaine& domaine_geom = domaine.domaine();
 
-  if (longueur_filtre.size() != nbr_element)
+  if (tab_longueur_filtre.size() != nbr_element)
     {
       Cerr << "erreur dans la taille du DoubleVect valeurs de la longueur du filtre" << finl;
       Process::exit();
@@ -508,125 +508,91 @@ DoubleVect& calcul_longueur_filtre(DoubleVect& longueur_filtre, const Motcle& me
 
   if (methode == Motcle("volume") || methode == Motcle("volume_sans_lissage"))  // racine cubique du volume
     {
-      longueur_filtre=-1.;
-
-      const DoubleVect& volume = domaine.volumes();
-      for (element=0; element<nbr_element; element ++)
-        {
-          longueur_filtre(element) = exp(log(volume[element])/double(dim));
-        }
+      CDoubleArrView volume = domaine.volumes().view_ro();
+      DoubleArrView longueur_filtre = tab_longueur_filtre.view_rw();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nbr_element), KOKKOS_LAMBDA(const int element)
+      {
+        longueur_filtre(element) = Kokkos::exp(Kokkos::log(volume(element)) / double(dim));
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
   else if (methode == Motcle("arete"))  // recherche de la plus longue arete d'un element
     {
-      longueur_filtre=-1.;
-
-      const IntTab& les_sommets = domaine_geom.les_elems();
-      int som1,som2,som_1,som_2;
-      double distance;
-
-      for (element=0; element<nbr_element; element ++)
-        for (som1=0; som1<dim; som1++)
-          for (som2=som1; som2<dim+1; som2++)
+      CIntTabView les_sommets = domaine_geom.les_elems().view_ro();
+      CDoubleTabView xs = domaine_geom.coord_sommets().view_ro();
+      DoubleArrView longueur_filtre = tab_longueur_filtre.view_rw();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nbr_element), KOKKOS_LAMBDA(const int element)
+      {
+        longueur_filtre(element) = -1.;
+        for (int som1 = 0; som1 < dim; som1++)
+          for (int som2 = som1; som2 < dim+1; som2++)
             {
-              som_1 = les_sommets(element, som1);
-              som_2 = les_sommets(element, som2);
-
-              distance = distance_sommets(som_1, som_2, domaine);
-              distance/=sqrt(2.);
-              longueur_filtre(element) = std::max(longueur_filtre(element), distance);
+              int som_1 = les_sommets(element, som1);
+              int som_2 = les_sommets(element, som2);
+              double distance = distance_sommets(som_1, som_2, xs);
+              distance /= Kokkos::sqrt(2.);
+              longueur_filtre(element) = Kokkos::fmax(longueur_filtre(element), distance);
             }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
-  else if (methode ==  Motcle("Scotti"))  // application de Scotti a un pseudo-cube
+  else if (methode == Motcle("Scotti"))  // application de Scotti a un pseudo-cube
     {
-      longueur_filtre=-1.;
-
-      const IntTab& les_sommets = domaine_geom.les_elems();
-      int som0,som1,som2,som3;
-      int som_0,som_1,som_2,som_3;
-
-      if(Objet_U::dimension==2)  // On revient a la racine carre du volume
+      CDoubleArrView volume = domaine.volumes().view_ro();
+      CIntTabView les_sommets = domaine_geom.les_elems().view_ro();
+      CDoubleTabView xs = domaine_geom.coord_sommets().view_ro();
+      DoubleArrView longueur_filtre = tab_longueur_filtre.view_rw();
+      if (dim == 2)  // On revient a la racine carre du volume
         {
-          int nbr_elementb=domaine.nb_elem_tot();
-          int elementb;
-          //int dim=Objet_U::dimension;
-          const DoubleVect& volume = domaine.volumes();
-
-          for (elementb=0; elementb<nbr_elementb; elementb ++)
-            {
-              longueur_filtre(elementb) = exp(log(volume[elementb])/double(dim));
-            }
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nbr_element), KOKKOS_LAMBDA(const int element)
+          {
+            longueur_filtre(element) = Kokkos::exp(Kokkos::log(volume(element)) / double(dim));
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
       else
         {
-          ArrOfDouble psc(4);
-          ArrOfDouble dist(3);
-          double dist_tot,dist_min,dist_max,dist_moy;
-          double a1,a2,f_scotti;
-
-          som_0=0;
-          som_1=0;
-          som_2=0;
-          som_3=0;
-
-          for (element=0; element<nbr_element; element ++)
-            {
-              som0 = les_sommets(element, 0);
-              som1 = les_sommets(element, 1);
-              som2 = les_sommets(element, 2);
-              som3 = les_sommets(element, 3);
-
-              psc[0] = som_pscal(som0,som1,som2,som3,domaine);
-              psc[1] = som_pscal(som1,som0,som2,som3,domaine);
-              psc[2] = som_pscal(som2,som0,som1,som3,domaine);
-              psc[3] = som_pscal(som3,som0,som1,som2,domaine);
-
-              const int indice_min = imin_array(psc);
-              if(indice_min==0)
-                {
-                  som_0=som0;
-                  som_1=som1;
-                  som_2=som2;
-                  som_3=som3;
-                }
-              if(indice_min==1)
-                {
-                  som_0=som1;
-                  som_1=som0;
-                  som_2=som2;
-                  som_3=som3;
-                }
-              if(indice_min==2)
-                {
-                  som_0=som2;
-                  som_1=som0;
-                  som_2=som1;
-                  som_3=som3;
-                }
-              if(indice_min==3)
-                {
-                  som_0=som3;
-                  som_1=som0;
-                  som_2=som1;
-                  som_3=som2;
-                }
-
-              dist[0]=distance_sommets(som_0,som_1,domaine);
-              dist[1]=distance_sommets(som_0,som_2,domaine);
-              dist[2]=distance_sommets(som_0,som_3,domaine);
-
-              dist_tot=dist[0]+dist[1]+dist[2];
-
-              dist_min=min_array(dist);
-              dist_max=max_array(dist);
-              dist_moy=dist_tot-dist_min-dist_max;
-
-              a1=dist_min/dist_max;
-              a2=dist_moy/dist_max;
-
-              f_scotti=cosh(sqrt((4./27.)*( log(a1)*log(a1) - log(a1)*log(a2) + log(a2)*log(a2) )));
-
-              longueur_filtre(element) = f_scotti * exp(log(dist_min*dist_max*dist_moy)/3.);
-            }
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nbr_element), KOKKOS_LAMBDA(const int element)
+          {
+            const int som0 = les_sommets(element, 0);
+            const int som1 = les_sommets(element, 1);
+            const int som2 = les_sommets(element, 2);
+            const int som3 = les_sommets(element, 3);
+
+            double psc[4];
+            psc[0] = som_pscal(som0, som1, som2, som3, xs);
+            psc[1] = som_pscal(som1, som0, som2, som3, xs);
+            psc[2] = som_pscal(som2, som0, som1, som3, xs);
+            psc[3] = som_pscal(som3, som0, som1, som2, xs);
+
+            int indice_min = 0;
+            for (int ii = 1; ii < 4; ii++)
+              if (psc[ii] < psc[indice_min]) indice_min = ii;
+
+            int som_0, som_1, som_2, som_3;
+            if (indice_min == 0)      { som_0=som0; som_1=som1; som_2=som2; som_3=som3; }
+            else if (indice_min == 1) { som_0=som1; som_1=som0; som_2=som2; som_3=som3; }
+            else if (indice_min == 2) { som_0=som2; som_1=som0; som_2=som1; som_3=som3; }
+            else                      { som_0=som3; som_1=som0; som_2=som1; som_3=som2; }
+
+            double dist[3];
+            dist[0] = distance_sommets(som_0, som_1, xs);
+            dist[1] = distance_sommets(som_0, som_2, xs);
+            dist[2] = distance_sommets(som_0, som_3, xs);
+
+            const double dist_tot = dist[0] + dist[1] + dist[2];
+            const double dist_min = Kokkos::fmin(dist[0], Kokkos::fmin(dist[1], dist[2]));
+            const double dist_max = Kokkos::fmax(dist[0], Kokkos::fmax(dist[1], dist[2]));
+            const double dist_moy = dist_tot - dist_min - dist_max;
+
+            const double a1 = dist_min / dist_max;
+            const double a2 = dist_moy / dist_max;
+            const double log_a1 = Kokkos::log(a1);
+            const double log_a2 = Kokkos::log(a2);
+            const double f_scotti = Kokkos::cosh(Kokkos::sqrt((4./27.) * (log_a1*log_a1 - log_a1*log_a2 + log_a2*log_a2)));
+            longueur_filtre(element) = f_scotti * Kokkos::exp(Kokkos::log(dist_min * dist_max * dist_moy) / 3.);
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }//3D
     }
   else
@@ -634,43 +600,47 @@ DoubleVect& calcul_longueur_filtre(DoubleVect& longueur_filtre, const Motcle& me
       Cerr << "calcul_longueur_filtre.cpp n'a pas reconnu l'argument : " << methode << finl;
       Cerr << "les arguments possibles sont : \"volume\", \"volume_sans_lissage\", \"Scotti\", \"arete\"." << finl;
       Process::exit();
-
     }
 
-
-  if ( ! (methode == Motcle("volume_sans_lissage")) )  // processus de "regularisation"
+  if (!(methode == Motcle("volume_sans_lissage")))  // processus de "regularisation"
     {
-      const Domaine& dom=domaine.domaine();
-      const IntTab& les_sommets = domaine_geom.les_elems();
-      int nb_sommet = domaine.nb_som_tot();
-      ArrOfDouble longueur_filtre_sommet(nb_sommet);
-      int som1;
-      int som_0,som_1;
-
-      longueur_filtre_sommet=-1.;
-
-      for (element=0; element<nbr_element; element ++)
-        for (som1=0; som1<dim+1; som1++)
+      const Domaine& dom = domaine.domaine();
+      const int nb_sommet = domaine.nb_som_tot();
+      CIntTabView les_sommets = domaine_geom.les_elems().view_ro();
+      CIntArrView renum_som_perio = dom.get_renum_som_perio().view_ro();
+
+      DoubleTrav tab_longueur_filtre_sommet(nb_sommet);
+      tab_longueur_filtre_sommet = -1.;
+      DoubleArrView longueur_filtre_sommet = static_cast<ArrOfDouble&>(tab_longueur_filtre_sommet).view_rw();
+      CDoubleArrView longueur_filtre = tab_longueur_filtre.view_ro();
+
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nbr_element), KOKKOS_LAMBDA(const int element)
+      {
+        for (int som1 = 0; som1 < dim+1; som1++)
           {
-            som_0 = les_sommets(element, som1);
-            som_1 = dom.get_renum_som_perio(som_0);
-            longueur_filtre_sommet[som_1] = std::max(longueur_filtre(element), longueur_filtre_sommet[som_1]);
+            int som_0 = les_sommets(element, som1);
+            int som_1 = renum_som_perio(som_0);
+            Kokkos::atomic_fetch_max(&longueur_filtre_sommet(som_1), longueur_filtre(element));
           }
-
-      longueur_filtre=-1.;
-
-      for (element=0; element<nbr_element; element ++)
-        for (som1=0; som1<dim+1; som1++)
+      });
+      end_gpu_timer(__KERNEL_NAME__);
+
+      DoubleArrView longueur_filtre_rw = tab_longueur_filtre.view_rw();
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nbr_element), KOKKOS_LAMBDA(const int element)
+      {
+        longueur_filtre_rw(element) = -1.;
+        for (int som1 = 0; som1 < dim+1; som1++)
           {
-            som_0 = les_sommets(element, som1);
-            som_1 = dom.get_renum_som_perio(som_0);
-            longueur_filtre(element) = std::max (longueur_filtre(element), longueur_filtre_sommet[som_1]);
+            int som_0 = les_sommets(element, som1);
+            int som_1 = renum_som_perio(som_0);
+            longueur_filtre_rw(element) = Kokkos::fmax(longueur_filtre_rw(element), longueur_filtre_sommet(som_1));
           }
+      });
+      end_gpu_timer(__KERNEL_NAME__);
     }
 
-  assert(nbr_element == 0 || min_array(longueur_filtre)>0.);
-
-  return longueur_filtre;
+  assert(nbr_element == 0 || min_array(tab_longueur_filtre) > 0.);
+  return tab_longueur_filtre;
 }
 
 double distance_sommets(const int sommet1, const int sommet2, const Domaine_VEF& domaine)
diff --git a/src/VEF/Geometrie/distances_VEF.h b/src/VEF/Geometrie/distances_VEF.h
index b4baa80915..1ad81c0d0e 100644
--- a/src/VEF/Geometrie/distances_VEF.h
+++ b/src/VEF/Geometrie/distances_VEF.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -117,7 +117,7 @@ double distance_face(int dim, int fac, int fac1, CDoubleTabView xv, CDoubleTabVi
       a += ni * (xv(fac1,i) - xv(fac,i));
       b += ni * ni;
     }
-  return std::fabs(a / sqrt(b));
+  return Kokkos::fabs(a / sqrt(b));
 }
 
 // Kokkos function (factorize distance_2D and distance_3D functions)
@@ -132,7 +132,7 @@ double distance(int dim,int fac,int elem, CDoubleTabView xp, CDoubleTabView xv,
       norme += fn_i * fn_i;
       ps += fn_i * (xp(elem, i) - xv(fac, i));
     }
-  return std::fabs(ps/sqrt(norme));
+  return Kokkos::fabs(ps/sqrt(norme));
 }
 // Kokkos function (factorize norm_2D_vit1 and norm_3D_vit1)
 KOKKOS_INLINE_FUNCTION
@@ -168,7 +168,7 @@ double norm_vit1(int dim, CDoubleTabView vit, int fac, int nfac, const int* num,
       sum_carre += carre(v[i]);
       psc += v[i] * r[i];
     }
-  double norm_vit = sqrt(std::fabs(sum_carre-carre(psc)));
+  double norm_vit = sqrt(Kokkos::fabs(sum_carre-carre(psc)));
 
   // val1,val2 val3 sont les vitesses tangentielles
   for (int i=0; i<dim; i++)
@@ -210,7 +210,7 @@ double norm_vit1_lp(int dim, CDoubleTabView vit, int fac, int nfac, const int* n
       sum_carre += carre(v[i]);
       psc += v[i] * r[i];
     }
-  double norm_vit = sqrt(std::fabs(sum_carre-carre(psc)));
+  double norm_vit = sqrt(Kokkos::fabs(sum_carre-carre(psc)));
 
   // val1,val2 val3 sont les vitesses tangentielles
   for (int i=0; i<dim; i++)
@@ -245,7 +245,7 @@ double norm_vit_lp_k(int dim, CDoubleTabView vit, int num1, int fac, CDoubleTabV
       sum_carre += carre(v[i]);
       psc += v[i] * r[i];
     }
-  double norm_vit = sqrt(std::fabs(sum_carre-carre(psc)));
+  double norm_vit = sqrt(Kokkos::fabs(sum_carre-carre(psc)));
 
   // val1,val2 val3 sont les vitesses tangentielles
   for (int i=0; i<dim; i++)
@@ -254,4 +254,50 @@ double norm_vit_lp_k(int dim, CDoubleTabView vit, int num1, int fac, CDoubleTabV
 }
 // ToDo factorize norm_vit1, norm_vit1_lp, norm_vit_lp_k ?
 
+KOKKOS_INLINE_FUNCTION
+double distance_sommets(int som1, int som2, CDoubleTabView xs)
+{
+  double sum_sq = 0.; // squared Euclidean distance between rows som1 and som2 of xs
+  for (int c = 0, nb_comp = (int)xs.extent(1); c < nb_comp; c++)
+    {
+      const double delta = xs(som2, c) - xs(som1, c);
+      sum_sq += delta * delta;
+    }
+  return Kokkos::sqrt(sum_sq);
+}
+
+// Only called for 3D Scotti branch: sum of |dot products| of the unit edges leaving som0
+KOKKOS_INLINE_FUNCTION
+double som_pscal(int som0, int som1, int som2, int som3, CDoubleTabView xs)
+{
+  const int others[3] = { som1, som2, som3 };
+  double edge[3][3];
+  // Build each edge vector som0 -> others[k], then scale it by its own length
+  for (int k = 0; k < 3; k++)
+    {
+      double sq_len = 0.;
+      for (int c = 0; c < 3; c++)
+        {
+          edge[k][c] = xs(others[k], c) - xs(som0, c);
+          sq_len += edge[k][c] * edge[k][c];
+        }
+      const double len = Kokkos::sqrt(sq_len);
+      for (int c = 0; c < 3; c++)
+        edge[k][c] /= len;
+    }
+  // Pairwise dot products of the scaled edges: (1,2), (2,3), (3,1)
+  double cos12 = 0., cos23 = 0., cos31 = 0.;
+  for (int c = 0; c < 3; c++)
+    {
+      cos12 += edge[0][c] * edge[1][c];
+      cos23 += edge[1][c] * edge[2][c];
+      cos31 += edge[2][c] * edge[0][c];
+    }
+  // Absolute values are summed in the same pair order as above
+  const double abs12 = Kokkos::fabs(cos12);
+  const double abs23 = Kokkos::fabs(cos23);
+  const double abs31 = Kokkos::fabs(cos31);
+  return abs12 + abs23 + abs31;
+}
+
 #endif
diff --git a/src/VEF/Operateurs/Op_Conv/Op_Conv_EF_VEF_P1NC_Stab.cpp b/src/VEF/Operateurs/Op_Conv/Op_Conv_EF_VEF_P1NC_Stab.cpp
index b1751545df..dfc73c20fd 100644
--- a/src/VEF/Operateurs/Op_Conv/Op_Conv_EF_VEF_P1NC_Stab.cpp
+++ b/src/VEF/Operateurs/Op_Conv/Op_Conv_EF_VEF_P1NC_Stab.cpp
@@ -539,7 +539,7 @@ void Op_Conv_EF_VEF_P1NC_Stab::remplir_fluent() const
     double psc=0.;
     for (int i=0; i<nb_comp; i++)
       psc+=tab_vitesse(num_face,i)*face_normales(num_face,i);
-    fluent(num_face)=std::fabs(psc);
+    fluent(num_face)=Kokkos::fabs(psc);
   });
   end_gpu_timer(__KERNEL_NAME__);
 }
@@ -934,7 +934,7 @@ Op_Conv_EF_VEF_P1NC_Stab::calculer_senseur(CDoubleTabView3 Kij, CDoubleArrView t
                                            const int nb_comp, const int face_i,
                                            CIntTabView elem_faces, CIntTabView face_voisins, CIntTabView num_fac_loc,
                                            double* P_plus, double* P_moins,
-                                           double* Q_plus, double* Q_moins) const
+                                           double* Q_plus, double* Q_moins)
 {
   for (int i = 0; i < nb_comp; i++)
     {
@@ -3001,4 +3001,3 @@ void Op_Conv_EF_VEF_P1NC_Stab::test_implicite() const
 
   Process::exit();
 }
-
diff --git a/src/VEF/Operateurs/Op_Conv/Op_Conv_EF_VEF_P1NC_Stab.h b/src/VEF/Operateurs/Op_Conv/Op_Conv_EF_VEF_P1NC_Stab.h
index a6a649cc1f..bd65becc33 100644
--- a/src/VEF/Operateurs/Op_Conv/Op_Conv_EF_VEF_P1NC_Stab.h
+++ b/src/VEF/Operateurs/Op_Conv/Op_Conv_EF_VEF_P1NC_Stab.h
@@ -62,7 +62,7 @@ class Op_Conv_EF_VEF_P1NC_Stab : public Op_Conv_VEF_Face
   //test
   void         modifier_pour_Cl(Matrice_Morse&, DoubleTab&) const override;
 
-  public_for_cuda
+  protected_but_public_for_cuda
   void calculer_flux_bords(const DoubleTab&, const DoubleTab&, const DoubleTab&) const;
   void calculer_coefficients_operateur_centre(DoubleTab&,const int, const DoubleTab& vitesse) const;
   DoubleTab& ajouter_partie_compressible(const DoubleTab&, DoubleTab&, const DoubleTab& vitesse) const;
@@ -80,7 +80,7 @@ private :
   //Methodes pour l'explicite
   void reinit_conv_pour_Cl(const DoubleTab&,const IntList&, const DoubleTabs&, const DoubleTab&, DoubleTab&) const;
 
-  KOKKOS_INLINE_FUNCTION void calculer_senseur(CDoubleTabView3, CDoubleArrView, const int, const int, CIntTabView, CIntTabView, CIntTabView, double*, double*, double*, double*) const;
+  KOKKOS_INLINE_FUNCTION static void calculer_senseur(CDoubleTabView3, CDoubleArrView, const int, const int, CIntTabView, CIntTabView, CIntTabView, double*, double*, double*, double*);
   inline void calculer_senseur(const DoubleTab&, const DoubleVect&, const int, const int, const IntTab&, const IntTab&, const IntTab&, ArrOfDouble&, ArrOfDouble&, ArrOfDouble&, ArrOfDouble&) const;
   void ajouter_old(const DoubleTab& , DoubleTab&, const DoubleTab& vitesse) const;
   void calculer_data_pour_dirichlet();
diff --git a/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.cpp b/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.cpp
index 90c64bd11f..b935cb243c 100644
--- a/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.cpp
+++ b/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -584,7 +584,7 @@ void Op_Conv_Muscl_New_VEF_Face::remplir_fluent() const
     double psc=0.;
     for (int i=0; i<dim; i++)
       psc+=velocity(num_face,i)*face_normales(num_face,i);
-    fluent(num_face)=std::fabs(psc);
+    fluent(num_face)=Kokkos::fabs(psc);
   });
   end_gpu_timer(__KERNEL_NAME__);
 }
@@ -790,7 +790,7 @@ void Op_Conv_Muscl_New_VEF_Face::calculer_flux_bords(const DoubleTab& Kij, const
       else if (sub_type(Neumann_sortie_libre,la_cl.valeur()))
         {
           const Neumann_sortie_libre& la_sortie_libre = ref_cast(Neumann_sortie_libre, la_cl.valeur());
-          CDoubleTabView val_ext = la_sortie_libre.val_ext().view_ro();
+          CDoubleTabView val_ext = la_sortie_libre.tab_val_ext().view_ro();
           Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), num2, KOKKOS_LAMBDA(const int ind_face)
           {
             int facei = le_bord_num_face(ind_face);
@@ -955,7 +955,7 @@ Op_Conv_Muscl_New_VEF_Face::ajouter_operateur_centre(const DoubleTab& tab_Kij, c
           CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro();
           CDoubleArrView transporteV = static_cast<const DoubleVect&>(tab_transporte).view_ro();
           CDoubleTabView3 Kij = tab_Kij.view_ro<3>();
-          CDoubleTabView val_ext = la_sortie_libre.val_ext().view_ro();
+          CDoubleTabView val_ext = la_sortie_libre.tab_val_ext().view_ro();
           DoubleArrView resuV = static_cast<DoubleVect&>(tab_resu).view_rw();
           Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
                                Kokkos::RangePolicy<>(0, num2), KOKKOS_LAMBDA(
@@ -1135,8 +1135,8 @@ Op_Conv_Muscl_New_VEF_Face::ajouter_antidiffusion_v2(const DoubleTab& tab_Kij, c
           double R;
           if (kij >= 0.) //facei amont
             {
-              if (fij >= 0.) R = (std::fabs(P_plus) < DMINFLOAT) ? 0. : Q_plus / P_plus;
-              else R = (std::fabs(P_moins) < DMINFLOAT) ? 0. : Q_moins / P_moins;
+              if (fij >= 0.) R = (Kokkos::fabs(P_plus) < DMINFLOAT) ? 0. : Q_plus / P_plus;
+              else R = (Kokkos::fabs(P_moins) < DMINFLOAT) ? 0. : Q_moins / P_moins;
 
 
               R = minmod(R);
@@ -1153,8 +1153,8 @@ Op_Conv_Muscl_New_VEF_Face::ajouter_antidiffusion_v2(const DoubleTab& tab_Kij, c
             }
           else     //facej amont
             {
-              if (fji <= 0.) R = (std::fabs(P_moins) < DMINFLOAT) ? 0. : Q_moins / P_moins;
-              else R = (std::fabs(P_plus) < DMINFLOAT) ? 0. : Q_plus / P_plus;
+              if (fji <= 0.) R = (Kokkos::fabs(P_moins) < DMINFLOAT) ? 0. : Q_moins / P_moins;
+              else R = (Kokkos::fabs(P_plus) < DMINFLOAT) ? 0. : Q_plus / P_plus;
 
               R = minmod(R);
               R *= fji;
@@ -1242,14 +1242,14 @@ Op_Conv_Muscl_New_VEF_Face::ajouter_antidiffusion_v1(const DoubleTab& tab_Kij, c
               //Face amont : facei
               if (fij >= 0.)
                 {
-                  Ri = (std::fabs(Pi_plus) < DMINFLOAT) ? 0. : Qi_plus / Pi_plus;
-                  Rj = (std::fabs(Pj_moins) < DMINFLOAT) ? 0. : Qj_moins /
+                  Ri = (Kokkos::fabs(Pi_plus) < DMINFLOAT) ? 0. : Qi_plus / Pi_plus;
+                  Rj = (Kokkos::fabs(Pj_moins) < DMINFLOAT) ? 0. : Qj_moins /
                        Pj_moins;//car fji=-fij
                 }
               else
                 {
-                  Ri = (std::fabs(Pi_moins) < DMINFLOAT) ? 0. : Qi_moins / Pi_moins;
-                  Rj = (std::fabs(Pj_plus) < DMINFLOAT) ? 0. : Qj_plus /
+                  Ri = (Kokkos::fabs(Pi_moins) < DMINFLOAT) ? 0. : Qi_moins / Pi_moins;
+                  Rj = (Kokkos::fabs(Pj_plus) < DMINFLOAT) ? 0. : Qj_plus /
                        Pj_plus;//car fji=-fij
                 }
 
@@ -1268,13 +1268,13 @@ Op_Conv_Muscl_New_VEF_Face::ajouter_antidiffusion_v1(const DoubleTab& tab_Kij, c
               //Face amont : facej
               if (fji <= 0.)
                 {
-                  Rj = (std::fabs(Pj_moins) < DMINFLOAT) ? 0. : Qj_moins / Pj_moins;
-                  Ri = (std::fabs(Pi_plus) < DMINFLOAT) ? 0. : Qi_plus / Pi_plus;
+                  Rj = (Kokkos::fabs(Pj_moins) < DMINFLOAT) ? 0. : Qj_moins / Pj_moins;
+                  Ri = (Kokkos::fabs(Pi_plus) < DMINFLOAT) ? 0. : Qi_plus / Pi_plus;
                 }
               else
                 {
-                  Rj = (std::fabs(Pj_plus) < DMINFLOAT) ? 0. : Qj_plus / Pj_plus;
-                  Ri = (std::fabs(Pi_moins) < DMINFLOAT) ? 0. : Qi_moins / Pi_moins;
+                  Rj = (Kokkos::fabs(Pj_plus) < DMINFLOAT) ? 0. : Qj_plus / Pj_plus;
+                  Ri = (Kokkos::fabs(Pi_moins) < DMINFLOAT) ? 0. : Qi_moins / Pi_moins;
                 }
 
               if (is_dirichlet_faces(facei))
@@ -1305,7 +1305,7 @@ Op_Conv_Muscl_New_VEF_Face::calculer_senseur(CDoubleTabView3 Kij, CDoubleTabView
                                              const int dim, const int nb_comp, const int face_i,
                                              CIntTabView elem_faces, CIntTabView face_voisins, CIntTabView num_fac_loc,
                                              double& P_plus, double& P_moins,
-                                             double& Q_plus, double& Q_moins) const
+                                             double& Q_plus, double& Q_moins)
 {
   const int nb_faces_elem=(int)elem_faces.extent(1);
   for (int elem_voisin=0; elem_voisin<2; elem_voisin++)
diff --git a/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.h b/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.h
index 12539ddfb3..f267790527 100644
--- a/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.h
+++ b/src/VEF/Operateurs/Op_Conv/Op_Conv_Muscl_New_VEF_Face.h
@@ -63,7 +63,7 @@ class Op_Conv_Muscl_New_VEF_Face : public Op_Conv_VEF_Face
   //test
   void         modifier_pour_Cl(Matrice_Morse&, DoubleTab&) const override;
 
-  public_for_cuda
+  protected_but_public_for_cuda
   void calculer_flux_bords(const DoubleTab&, const DoubleTab&, const DoubleTab&) const;
   void calculer_coefficients_operateur_centre(DoubleTab&,DoubleTab&,DoubleTab&,DoubleTab&,const int, const DoubleTab& vitesse) const;
   void calculer_flux_operateur_centre(DoubleTab&,const DoubleTab&,const DoubleTab&,const DoubleTab&,const DoubleTab&,const int,const DoubleTab&,const DoubleTab&) const;
@@ -84,7 +84,7 @@ private :
   DoubleTab& ajouter_antidiffusion(const DoubleTab&, const DoubleTab&, const DoubleTab&, DoubleTab&) const;
 
 
-  KOKKOS_INLINE_FUNCTION void calculer_senseur(CDoubleTabView3, CDoubleTabView4, CDoubleArrView, const int, const int, const int, CIntTabView, CIntTabView, CIntTabView, double&, double&, double&, double&) const;
+  KOKKOS_INLINE_FUNCTION static void calculer_senseur(CDoubleTabView3, CDoubleTabView4, CDoubleArrView, const int, const int, const int, CIntTabView, CIntTabView, CIntTabView, double&, double&, double&, double&);
   void calculer_data_pour_dirichlet();
 
   //Attributs de la classe
diff --git a/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.cpp b/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.cpp
index f0bf6722c3..387c06ef77 100644
--- a/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.cpp
+++ b/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.cpp
@@ -285,15 +285,15 @@ void compute_flux_tetra_kernel(const FluxTetraKernelData& kernel_data)
         // Determination du type de CL selon le rang
         int rang = rang_elem_non_std_v(poly);
         double xc[3];
-        TRUST_IFCONSTEXPR (ordre == 3) // A optimiser! Risque de mauvais resultats en parallel si ordre=3
-        {
-          double xsom[12];
-          for (int i = 0; i < nsom_; i++)
-            for (int j = 0; j < dim; j++)
-              xsom[i * 3 + j] = coord_sommets_v(les_elems_[i], j);
-          int idirichlet, n1, n2, n3;
-          calcul_xg_tetra(xc, xsom, itypcl, idirichlet, n1, n2, n3);
-        }
+        if (ordre == 3) // A optimiser! Risque de mauvais resultats en parallel si ordre=3
+          {
+            double xsom[12];
+            for (int i = 0; i < nsom_; i++)
+              for (int j = 0; j < dim; j++)
+                xsom[i * 3 + j] = coord_sommets_v(les_elems_[i], j);
+            int idirichlet, n1, n2, n3;
+            calcul_xg_tetra(xc, xsom, itypcl, idirichlet, n1, n2, n3);
+          }
 
         double xp[3] = { xp_v(poly,0), xp_v(poly,1), xp_v(poly,2) };
 
@@ -1275,7 +1275,7 @@ DoubleTab& Op_Conv_VEF_Face::ajouter_gen(const DoubleTab& transporte, const Cham
           int num2 = num1 + le_bord.nb_faces();
           int dim = Objet_U::dimension;
           CDoubleTabView face_normale = domaine_VEF.face_normales().view_ro();
-          CDoubleTabView val_ext = la_sortie_libre.val_ext().view_ro();
+          CDoubleTabView val_ext = la_sortie_libre.tab_val_ext().view_ro();
           CDoubleTabView transporte_face_v = transporte_face.view_ro();
           CDoubleTabView vitesse_face_v = vitesse_face.view_ro();
           DoubleTabView flux_b_v = flux_b.view_wo();
@@ -1780,7 +1780,7 @@ void Op_Conv_VEF_Face::remplir_fluent() const
             double psc_m = (psc_c + psc_s + psc_s2) / dim;
 
             int num = (psc_m >= 0 ? num2 : num1);
-            Kokkos::atomic_add(&fluent[num], std::abs(psc_m));
+            Kokkos::atomic_add(&fluent[num], Kokkos::fabs(psc_m));
           } // fin de la boucle sur les facettes
       };
       Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem_tot, kernel);
diff --git a/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.h b/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.h
index 1d1b57e7d2..6ab63f137e 100644
--- a/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.h
+++ b/src/VEF/Operateurs/Op_Conv/Op_Conv_VEF_Face.h
@@ -54,10 +54,11 @@ class Op_Conv_VEF_Face : public Op_Conv_VEF_base
   void get_alpha(double& ) const;
   void get_type_op(int& )const;
 
-protected:
+  protected_but_public_for_cuda
   DoubleTab& ajouter_gen(const DoubleTab& transporte, const Champ_Inc_base& la_vitesse, DoubleTab& resu) const;
   void ajouter_contribution_gen(const DoubleTab& transporte, const Champ_Inc_base& la_vitesse, Matrice_Morse& matrice ) const;
 
+protected:
   Motcle type_lim;
   enum type_lim_type {type_lim_minmod,type_lim_vanleer,type_lim_vanalbada,type_lim_chakravarthy,type_lim_superbee};
   type_lim_type type_lim_int = type_lim_minmod;
diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face.cpp b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face.cpp
index 555126b88c..482e5d20a0 100644
--- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face.cpp
+++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face.cpp
@@ -217,7 +217,7 @@ void Op_Diff_VEF_Face::ajouter_cas_scalaire(const DoubleTab& tab_inconnue,
         {
           const Neumann_paroi& la_cl_paroi = ref_cast(Neumann_paroi, la_cl.valeur());
           CDoubleArrView surface = domaine_VEF.face_surfaces().view_ro();
-          CDoubleTabView flux_impose = la_cl_paroi.flux_impose().view_ro();
+          CDoubleTabView flux_impose = la_cl_paroi.tab_flux_impose().view_ro();
           DoubleArrView flux_bords = static_cast<ArrOfDouble&>(tab_flux_bords).view_rw();
           DoubleArrView resu = static_cast<ArrOfDouble&>(tab_resu).view_rw();
           Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(ndeb, nfin), KOKKOS_LAMBDA(const int face)
diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face_Stab.cpp b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face_Stab.cpp
index 0ef4516074..c8fc65c851 100644
--- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face_Stab.cpp
+++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_Face_Stab.cpp
@@ -839,11 +839,14 @@ void Op_Diff_VEF_Face_Stab::completer()
         if ( (sub_type(Dirichlet,la_cl.valeur()))
              || (sub_type(Dirichlet_homogene,la_cl.valeur()))
            )
-          for (ind_face=0; ind_face<nb_faces_bord_tot; ind_face++)
-            {
-              face = le_bord.num_face(ind_face);
-              is_dirichlet_faces_(face)=1;
-            }
+          {
+            ToDo_Kokkos("critical");
+            for (ind_face=0; ind_face<nb_faces_bord_tot; ind_face++)
+              {
+                face = le_bord.num_face(ind_face);
+                is_dirichlet_faces_(face)=1;
+              }
+          }
       }
   }
 }
diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_base.h b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_base.h
index 81a75cf3d7..a27be02999 100644
--- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_base.h
+++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Diff_VEF_base.h
@@ -46,7 +46,7 @@ class Op_Diff_VEF_base : public Operateur_Diff_base, public Op_VEF_Face
   template <typename _TYPE_>
   double viscA(int face_i, int face_j, int num_elem, const _TYPE_& diffu) const;
   template <typename _TYPE_>
-  KOKKOS_INLINE_FUNCTION double viscA(int face_i, int face_j, int num_elem, const _TYPE_& diffu, CIntTabView face_voisins_v, CDoubleTabView face_normales_v, CDoubleArrView inverse_volumes_v) const;
+  KOKKOS_INLINE_FUNCTION static double viscA(int face_i, int face_j, int num_elem, const _TYPE_& diffu, CIntTabView face_voisins_v, CDoubleTabView face_normales_v, CDoubleArrView inverse_volumes_v);
 
   double calculer_dt_stab() const override;
   void calculer_pour_post(Champ_base& espace_stockage,const Nom& option,int comp) const override;
@@ -75,14 +75,14 @@ class Op_Diff_VEF_base : public Operateur_Diff_base, public Op_VEF_Face
   template<typename _TYPE_> std::enable_if_t< std::is_same<_TYPE_, TRUSTArray<double>>::value , double>
   inline diffu__(const int comp, const int num_elem, const _TYPE_ &diffu) const { return diffu(comp); }
 
-  template<typename _TYPE_> std::enable_if_t< std::is_same<_TYPE_, double>::value , double>
-  KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) const { return diffu; }
+  template<typename _TYPE_> static std::enable_if_t< std::is_same<_TYPE_, double>::value , double>
+  KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) { return diffu; }
 
-  template<typename _TYPE_> std::enable_if_t< std::is_same<_TYPE_, TRUSTTab<double>>::value , double>
-  KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) const { return diffu(num_elem, comp); }
+  template<typename _TYPE_> static std::enable_if_t< std::is_same<_TYPE_, TRUSTTab<double>>::value , double>
+  KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) { return diffu(num_elem, comp); }
 
-  template<typename _TYPE_> std::enable_if_t< std::is_same<_TYPE_, TRUSTArray<double>>::value , double>
-  KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) const { return diffu(comp); }
+  template<typename _TYPE_> static std::enable_if_t< std::is_same<_TYPE_, TRUSTArray<double>>::value , double>
+  KOKKOS_INLINE_FUNCTION diffu__view(const int comp, const int num_elem, const _TYPE_ &diffu) { return diffu(comp); }
 };
 
 // ATTENTION le diffu intervenant dans les fonctions n'est que LOCAL (on appelle d_nu apres)
@@ -117,7 +117,7 @@ inline double Op_Diff_VEF_base::viscA(int i, int j, int num_elem, const _TYPE_ &
 }
 
 template<typename _TYPE_>
-KOKKOS_INLINE_FUNCTION double Op_Diff_VEF_base::viscA(int i, int j, int num_elem, const _TYPE_ &diffu, CIntTabView face_voisins_v, CDoubleTabView face_normales_v, CDoubleArrView inverse_volumes_v) const
+KOKKOS_INLINE_FUNCTION double Op_Diff_VEF_base::viscA(int i, int j, int num_elem, const _TYPE_ &diffu, CIntTabView face_voisins_v, CDoubleTabView face_normales_v, CDoubleArrView inverse_volumes_v)
 {
   constexpr bool is_double = std::is_same<_TYPE_, double>::value;
   int dim = (int)face_normales_v.extent(1);
diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_Stab_VEF_Face.cpp b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_Stab_VEF_Face.cpp
index 8457451bea..339c2dc9d0 100644
--- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_Stab_VEF_Face.cpp
+++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_Stab_VEF_Face.cpp
@@ -622,11 +622,14 @@ void Op_Dift_Stab_VEF_Face::completer()
       int nb_faces_bord_tot = le_bord.nb_faces_tot(), face = -1;
 
       if ((sub_type(Dirichlet, la_cl.valeur())) || (sub_type(Dirichlet_homogene, la_cl.valeur())))
-        for (ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
-          {
-            face = le_bord.num_face(ind_face);
-            is_dirichlet_faces_(face) = 1;
-          }
+        {
+          ToDo_Kokkos("critical");
+          for (ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
+            {
+              face = le_bord.num_face(ind_face);
+              is_dirichlet_faces_(face) = 1;
+            }
+        }
     }
 }
 
diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.h b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.h
index de9e38ba21..ce65724f3f 100644
--- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.h
+++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.h
@@ -66,7 +66,7 @@ class Op_Dift_VEF_Face_Gen
 
 private:
 
-  public_for_cuda
+  protected_but_public_for_cuda
   template <Type_Champ _TYPE_, Type_Schema _SCHEMA_, bool _IS_STAB_ = false, bool _IS_RANS_ = false >
   void ajouter_bord_perio_gen__(const int , const DoubleTab&, DoubleTab* /* Si explicite */ , Matrice_Morse* /* Si implicite */, const DoubleTab&, const DoubleTab&, const DoubleVect& , DoubleTab* flux_bord = nullptr /* flux_bords */) const;
 
diff --git a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.tpp b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.tpp
index 90c58c0224..2c0ad4cdcc 100644
--- a/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.tpp
+++ b/src/VEF/Operateurs/Op_Diff_Dift/Op_Dift_VEF_Face_Gen.tpp
@@ -424,7 +424,7 @@ void Op_Dift_VEF_Face_Gen<DERIVED_T>::modifie_pour_cl_gen(const DoubleTab& tab_i
       if (sub_type(Neumann_paroi, la_cl.valeur()))
         {
           const Neumann_paroi& la_cl_paroi = ref_cast(Neumann_paroi, la_cl.valeur());
-          CDoubleTabView flux_impose = la_cl_paroi.flux_impose().view_ro();
+          CDoubleTabView flux_impose = la_cl_paroi.tab_flux_impose().view_ro();
           CDoubleArrView face_surfaces = domaine_VEF.face_surfaces().view_ro();
           DoubleTabView flux_bords = tab_flux_bords.view_wo();
           DoubleTabView resu = tab_resu.view_rw();
diff --git a/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.cpp b/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.cpp
index d1eddd8cc8..d33be3a4ee 100644
--- a/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.cpp
+++ b/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -112,18 +112,15 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_elem(const DoubleTab& vit, DoubleTab& div
       RandomAccessView<int, 2> face_voisins_v = face_voisins.view_ro();
       RandomAccessView<double, 2> face_normales_v = face_normales.view_ro();
       RandomAccessView<double, 2> vit_v = vit.view_ro();
-      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(
-                             const int elem)
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0,0}, {nb_elem,nfe}), KOKKOS_LAMBDA(
+                             const int elem, const int indice)
       {
         double pscf = 0;
-        for (int indice = 0; indice < nfe; indice++)
-          {
-            int face = elem_faces_v(elem, indice);
-            int signe = elem == face_voisins_v(face, 0) ? 1 : -1;
-            for (int comp = 0; comp < dim; comp++)
-              pscf += signe * vit_v(face, comp) * face_normales_v(face, comp);
-          }
-        div_v(elem, 0) += pscf;
+        int face = elem_faces_v(elem, indice);
+        int signe = elem == face_voisins_v(face, 0) ? 1 : -1;
+        for (int comp = 0; comp < dim; comp++)
+          pscf += signe * vit_v(face, comp) * face_normales_v(face, comp);
+        Kokkos::atomic_add(&div_v(elem, 0), pscf);
       });
     }
   else
@@ -131,18 +128,15 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_elem(const DoubleTab& vit, DoubleTab& div
       CIntTabView face_voisins_v = face_voisins.view_ro();
       CDoubleTabView face_normales_v = face_normales.view_ro();
       CDoubleTabView vit_v = vit.view_ro();
-      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem, KOKKOS_LAMBDA(
-                             const int elem)
+      Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_2D({0,0}, {nb_elem,nfe}), KOKKOS_LAMBDA(
+                             const int elem, const int indice)
       {
         double pscf = 0;
-        for (int indice = 0; indice < nfe; indice++)
-          {
-            int face = elem_faces_v(elem, indice);
-            int signe = elem == face_voisins_v(face, 0) ? 1 : -1;
-            for (int comp = 0; comp < dim; comp++)
-              pscf += signe * vit_v(face, comp) * face_normales_v(face, comp);
-          }
-        div_v(elem, 0) += pscf;
+        int face = elem_faces_v(elem, indice);
+        int signe = elem == face_voisins_v(face, 0) ? 1 : -1;
+        for (int comp = 0; comp < dim; comp++)
+          pscf += signe * vit_v(face, comp) * face_normales_v(face, comp);
+        Kokkos::atomic_add(&div_v(elem, 0), pscf);
       });
     }
   end_gpu_timer(__KERNEL_NAME__);
@@ -321,7 +315,6 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab&
   // Initialisation tableaux constants
   if (!som_initialized_)
     {
-      som_initialized_ = true;
       const IntTab& som_elem = domaine.les_elems();
       som_.resize(nb_elem_tot, nfe);
       nb_degres_liberte_.resize(domaine_VEF.domaine().nb_som_tot());
@@ -340,14 +333,20 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab&
   int modif_traitement_diri = domaine_VEF.get_modif_div_face_dirichlet();
   const Domaine_Cl_VEF& zcl = ref_cast(Domaine_Cl_VEF, la_zcl_vef.valeur());
 
+  CIntArrView rang_elem_non_std;
+  CIntArrView type_elem_Cl;
+  if (modif_traitement_diri)
+    {
+      rang_elem_non_std = domaine_VEF.rang_elem_non_std().view_ro();
+      type_elem_Cl = zcl.type_elem_Cl().view_ro();
+    }
   CDoubleTabView face_normales = domaine_VEF.face_normales().view_ro();
   CDoubleTabView vit = tab_vit.view_ro();
-  CIntArrView rang_elem_non_std = domaine_VEF.rang_elem_non_std().view_ro();
-  CIntArrView type_elem_Cl = zcl.type_elem_Cl().view_ro();
   CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro();
   CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro();
   CIntTabView som_v = som_.view_ro();
   DoubleArrView div = static_cast<DoubleVect&>(tab_div).view_rw();
+  // PL: not possible to use MDRangePolicy here. It needs sigma to be the complete sum over faces
   Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
                        range_1D(0, nb_elem_tot),
                        KOKKOS_LAMBDA (const int elem)
@@ -418,9 +417,9 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab&
           if (sub_type(Dirichlet,la_cl.valeur()) || sub_type(Dirichlet_homogene, la_cl.valeur()) || sub_type(Dirichlet_entree_fluide, la_cl.valeur()) || sub_type(Symetrie, la_cl.valeur()))
             libre = 0;
 
+          int som_initialized = som_initialized_;
           CIntTabView face_sommets = domaine_VEF.face_sommets().view_ro();
           CIntArrView renum_som_perio = domaine.get_renum_som_perio().view_ro();
-
           // On boucle sur les faces de bord reelles et virtuelles
           Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__),
                                range_1D(0, nb_faces_bord_tot), KOKKOS_LAMBDA(
@@ -437,7 +436,7 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab&
               {
                 int som = renum_som_perio(face_sommets(face, indice));
                 Kokkos::atomic_add(&div(nps + som), flux);
-                if (libre)
+                if (libre && !som_initialized)
                   Kokkos::atomic_add(&nb_degres_liberte(som), 1);
               }
           });
@@ -470,6 +469,7 @@ DoubleTab& Op_Div_VEFP1B_Elem::ajouter_som(const DoubleTab& tab_vit, DoubleTab&
           end_gpu_timer(__KERNEL_NAME__);
         }
     }
+  som_initialized_ = true;
   return tab_div;
 }
 
@@ -755,99 +755,111 @@ void Op_Div_VEFP1B_Elem::degres_liberte() const
   decoup_som << "1" << finl;
   decoup_som << Objet_U::dimension << " " << nb_som << finl;
   ArrOfInt somm(dimension + 2);
-  for (int k = 0; k < nb_som; k++)
+  CIntArrView renum_som_perio = domaine.get_renum_som_perio().view_ro();
+  CIntArrView nb_degres_liberte = nb_degres_liberte_.view_ro();
+  int error = 0;
+  Kokkos::parallel_reduce(start_gpu_timer(__KERNEL_NAME__), nb_som, KOKKOS_LAMBDA(const int k, int& local_error)
+  {
+    int sommet = renum_som_perio(k);
+    if (nb_degres_liberte(sommet) == 0) local_error++;
+  }, error);
+  end_gpu_timer(__KERNEL_NAME__);
+  if (error)
     {
-      int sommet = domaine.get_renum_som_perio(k);
-      if (nb_degres_liberte_(sommet) != 0)
-        continue;
-      if (!afficher_message)
+      for (int k = 0; k < nb_som; k++)
         {
-          afficher_message = 1;
-          Cerr << finl << "Problem with the mesh used for the VEF P1Bulle discretization." << finl;
-          journal << "List of nodes with no degrees of freedom :" << finl;
-        }
-      const double x = domaine.coord(sommet, 0);
-      const double y = domaine.coord(sommet, 1);
-      const double z = (Objet_U::dimension == 3) ? domaine.coord(sommet, 2) : 0.;
-
-      journal << "Error node " << sommet << " ( " << x << " " << y << " " << z << " )\n";
-      // On affiche la liste des indices d'elements reels et virtuels qui contiennent
-      // ce sommet. On affiche la lettre "v" pour les elements virtuels.
-      journal << "Elements ";
-      const int nb_elem_tot = domaine.nb_elem_tot();
-      const int nb_elem = domaine.nb_elem();
-      const IntTab& som_elem = domaine.les_elems();
-      for (int elem = 0; elem < nb_elem_tot; elem++)
-        for (int som = 0; som < nse; som++)
-          if (som_elem(elem, som) == sommet)
+          int sommet = domaine.get_renum_som_perio(k);
+          if (nb_degres_liberte_(sommet) != 0)
+            continue;
+          if (!afficher_message)
             {
-              journal << elem << ((elem >= nb_elem) ? "v " : " ");
+              afficher_message = 1;
+              Cerr << finl << "Problem with the mesh used for the VEF P1Bulle discretization." << finl;
+              journal << "List of nodes with no degrees of freedom :" << finl;
+            }
+          const double x = domaine.coord(sommet, 0);
+          const double y = domaine.coord(sommet, 1);
+          const double z = (Objet_U::dimension == 3) ? domaine.coord(sommet, 2) : 0.;
+
+          journal << "Error node " << sommet << " ( " << x << " " << y << " " << z << " )\n";
+          // On affiche la liste des indices d'elements reels et virtuels qui contiennent
+          // ce sommet. On affiche la lettre "v" pour les elements virtuels.
+          journal << "Elements ";
+          const int nb_elem_tot = domaine.nb_elem_tot();
+          const int nb_elem = domaine.nb_elem();
+          const IntTab& som_elem = domaine.les_elems();
+          for (int elem = 0; elem < nb_elem_tot; elem++)
+            for (int som = 0; som < nse; som++)
+              if (som_elem(elem, som) == sommet)
+                {
+                  journal << elem << ((elem >= nb_elem) ? "v " : " ");
 
-              // Ecriture dans le fichier decoupage_som
-              int face_opp = elem_faces(elem, som);
-              int elem_opp;
-              somm = -1;
-              somm(0) = sommet;
+                  // Ecriture dans le fichier decoupage_som
+                  int face_opp = elem_faces(elem, som);
+                  int elem_opp;
+                  somm = -1;
+                  somm(0) = sommet;
 
-              int elem1 = face_voisins(face_opp, 0);
-              int elem2 = face_voisins(face_opp, 1);
+                  int elem1 = face_voisins(face_opp, 0);
+                  int elem2 = face_voisins(face_opp, 1);
 
-              if (elem1 == elem)
-                elem_opp = elem2;
-              else
-                elem_opp = elem1;
-
-              int i = 2;
-              for (int som1 = 0; som1 < nse; som1++)  // on parcourt les sommets de elem_opp
-                {
-                  int ok = 1;
-                  for (int som2 = 0; som2 < nse; som2++)  // on parcourt les sommets de elem
-                    if (som_elem(elem, som2) == som_elem(elem_opp, som1))
-                      ok = 0;
-                  if (ok)
-                    somm(1) = som_elem(elem_opp, som1);
+                  if (elem1 == elem)
+                    elem_opp = elem2;
                   else
+                    elem_opp = elem1;
+
+                  int i = 2;
+                  for (int som1 = 0; som1 < nse; som1++)  // on parcourt les sommets de elem_opp
                     {
-                      somm(i) = som_elem(elem_opp, som1);  // sommets de la face commune
-                      i++;
+                      int ok = 1;
+                      for (int som2 = 0; som2 < nse; som2++)  // on parcourt les sommets de elem
+                        if (som_elem(elem, som2) == som_elem(elem_opp, som1))
+                          ok = 0;
+                      if (ok)
+                        somm(1) = som_elem(elem_opp, som1);
+                      else
+                        {
+                          somm(i) = som_elem(elem_opp, som1);  // sommets de la face commune
+                          i++;
+                        }
+                    }
+                  if (decoupage_som)
+                    {
+                      ecrire_decoupage_som = 1;
+                      for (int j = 0; j < dimension + 2; j++)
+                        decoup_som << somm(j) << " ";
+                      decoup_som << elem << " " << elem_opp << finl;
                     }
                 }
-              if (decoupage_som)
-                {
-                  ecrire_decoupage_som = 1;
-                  for (int j = 0; j < dimension + 2; j++)
-                    decoup_som << somm(j) << " ";
-                  decoup_som << elem << " " << elem_opp << finl;
-                }
-            }
-      journal << "\n";
-      // On affiche la liste des faces qui contiennent ce sommet.
-      // Pour les faces de bord, on affiche la condlim,
-      // pour les faces virtuelles, la lettre "v"
-      journal << "\nFaces ";
-      const int nb_faces = domaine_VEF.nb_faces();
-      const int nb_som_face = domaine_VEF.face_sommets().dimension(1);
-      for (int face = 0; face < nb_faces_tot; face++)
-        {
-          for (int som = 0; som < nb_som_face; som++)
+          journal << "\n";
+          // On affiche la liste des faces qui contiennent ce sommet.
+          // Pour les faces de bord, on affiche la condlim,
+          // pour les faces virtuelles, la lettre "v"
+          journal << "\nFaces ";
+          const int nb_faces = domaine_VEF.nb_faces();
+          const int nb_som_face = domaine_VEF.face_sommets().dimension(1);
+          for (int face = 0; face < nb_faces_tot; face++)
             {
-              if (domaine_VEF.face_sommets(face, som) == sommet)
+              for (int som = 0; som < nb_som_face; som++)
                 {
-                  journal << face;
-                  if (face >= nb_faces) // Face virtuelle
-                    journal << "v";
-                  const int cl = find_cl_face(domaine, face);
-                  // Face de bord reelle:
-                  if (cl >= 0)
+                  if (domaine_VEF.face_sommets(face, som) == sommet)
                     {
-                      const Nom& nom_bord = domaine.frontiere(cl).le_nom();
-                      journal << "(boundary=" << nom_bord << ")";
+                      journal << face;
+                      if (face >= nb_faces) // Face virtuelle
+                        journal << "v";
+                      const int cl = find_cl_face(domaine, face);
+                      // Face de bord reelle:
+                      if (cl >= 0)
+                        {
+                          const Nom& nom_bord = domaine.frontiere(cl).le_nom();
+                          journal << "(boundary=" << nom_bord << ")";
+                        }
+                      journal << " ";
                     }
-                  journal << " ";
                 }
             }
+          journal << finl;
         }
-      journal << finl;
     }
 
   if (ecrire_decoupage_som)
diff --git a/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.h b/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.h
index 2c80e73d2d..3b3c77d9ed 100644
--- a/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.h
+++ b/src/VEF/Operateurs/Op_Divers/Op_Div_VEFP1B_Elem.h
@@ -56,7 +56,7 @@ class Op_Div_VEFP1B_Elem: public Operateur_Div_base
   void contribuer_a_avec(const DoubleTab&, Matrice_Morse&) const override { }
   void contribuer_au_second_membre(DoubleTab&) const override { }
 
-  public_for_cuda
+  protected_but_public_for_cuda
   void volumique_P0(DoubleTab&) const;
 
 private:
diff --git a/src/VEF/Operateurs/Op_Divers/Op_Grad_VEF_P1B_Face.cpp b/src/VEF/Operateurs/Op_Divers/Op_Grad_VEF_P1B_Face.cpp
index e7c327a6a8..d745f8bcc4 100644
--- a/src/VEF/Operateurs/Op_Divers/Op_Grad_VEF_P1B_Face.cpp
+++ b/src/VEF/Operateurs/Op_Divers/Op_Grad_VEF_P1B_Face.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -215,7 +215,7 @@ DoubleTab& Op_Grad_VEF_P1B_Face::modifier_grad_pour_Cl(DoubleTab& tab_grad) cons
                       norm += face_normales(face, comp) * face_normales(face, comp);
                     }
                   // psc/=norm; // Fixed bug: Arithmetic exception
-                  if (std::fabs(norm) >= DMINFLOAT)
+                  if (Kokkos::fabs(norm) >= DMINFLOAT)
                     psc /= norm;
                   for (int comp = 0; comp < dim; comp++)
                     grad(face, comp) -= psc * face_normales(face, comp);
@@ -242,7 +242,6 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_elem(const DoubleTab& tab_pre, DoubleTa
   const Domaine_VEF& domaine_VEF = domaine_vef();
   assert(domaine_VEF.get_alphaE());
   const Domaine& domaine = domaine_VEF.domaine();
-  const IntTab& elem_faces = domaine_VEF.elem_faces();
   int nfe = domaine.nb_faces_elem();
   int nb_elem_tot = domaine.nb_elem_tot();
   CDoubleArrView porosite_face = equation().milieu().porosite_face().view_ro();
@@ -265,7 +264,7 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_elem(const DoubleTab& tab_pre, DoubleTa
               const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
               int num1 = le_bord.num_premiere_face();
               int num2 = num1 + le_bord.nb_faces();
-              CDoubleTabView flux_impose = la_sortie_libre.flux_impose().view_ro();
+              CDoubleTabView flux_impose = la_sortie_libre.tab_flux_impose().view_ro();
               DoubleTabView grad = tab_grad.view_rw();
               Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(num1, num2), KOKKOS_LAMBDA(const int face)
               {
@@ -302,30 +301,22 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_elem(const DoubleTab& tab_pre, DoubleTa
     }
 
   CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro();
-  CIntTabView elem_faces_v = elem_faces.view_ro();
+  CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro();
   CDoubleTabView pre = tab_pre.view_ro();
   DoubleTabView grad = tab_grad.view_rw();
   int dim = Objet_U::dimension;
-
-  auto kern_elem = KOKKOS_LAMBDA(int
-                                 elem)
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0,0}, {nb_elem_tot,nfe}), KOKKOS_LAMBDA (int elem, int indice)
   {
-    for (int indice = 0; indice < nfe; indice++)
+    int face = elem_faces(elem, indice);
+    double pe = pre(elem, 0);
+    double signe = elem == face_voisins(face, 0) ? 1 : -1;
+    double poro = porosite_face(face);
+    double coeff = pe * signe * poro;
+    for (int comp = 0; comp < dim; comp++)
       {
-        double pe = pre(elem, 0);
-        int face = elem_faces_v(elem, indice);
-        double signe = 1;
-        if (elem != face_voisins(face, 0)) signe = -1;
-        for (int comp = 0; comp < dim; comp++)
-          {
-            double val = pe * signe * face_normales(face, comp) * porosite_face(face);
-            Kokkos::atomic_sub(&grad(face, comp), val);
-
-          }
+        Kokkos::atomic_sub(&grad(face, comp), coeff * face_normales(face, comp));
       }
-  };
-
-  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_elem_tot, kern_elem);
+  });
   end_gpu_timer(__KERNEL_NAME__);
 
   return tab_grad;
@@ -377,21 +368,16 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_som(const DoubleTab& tab_pre, DoubleTab
                        KOKKOS_LAMBDA (int elem, int indice)
   {
     int face = elem_faces(elem,indice);
-
-    double signe = 1;
-    if (elem != face_voisins(face,0))
-      signe = -1;
-
-    double sigma[3];
-    for (int comp = 0; comp < dim; comp++)
-      sigma[comp] = face_normales(face,comp) * signe;
-
+    double signe = elem == face_voisins(face,0) ? 1 : -1;
+    double pe = pre(som_v(elem,indice));
+    double coeff = coeff_som(elem) * pe * signe;
     for (int indice2 = 0; indice2 < nfe; indice2++)
       {
-        int face2 = elem_faces(elem,indice2);
+        int face2 = elem_faces(elem, indice2);
+        double poro = porosite_face(face2);
         for (int comp = 0; comp < dim; comp++)
           {
-            Kokkos::atomic_add(&grad(face2,comp), -(coeff_som(elem) * pre(som_v(elem,indice)) * sigma[comp] * porosite_face(face2)));
+            Kokkos::atomic_sub(&grad(face2,comp), coeff * poro * face_normales(face,comp));
           }
       }
   });
@@ -417,7 +403,7 @@ DoubleTab& Op_Grad_VEF_P1B_Face::ajouter_som(const DoubleTab& tab_pre, DoubleTab
               const Neumann_sortie_libre& sortie_libre = ref_cast(Neumann_sortie_libre, la_cl.valeur());
               int num1 = le_bord.num_premiere_face();
               int num2 = num1 + le_bord.nb_faces();
-              CDoubleTabView flux_impose = sortie_libre.flux_impose().view_ro();
+              CDoubleTabView flux_impose = sortie_libre.tab_flux_impose().view_ro();
               CIntTabView face_sommets = domaine_VEF.face_sommets().view_ro();
               CIntArrView renum_som_perio = dom.get_renum_som_perio().view_ro();
               Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), Kokkos::RangePolicy<>(num1, num2), KOKKOS_LAMBDA(const int face)
@@ -665,13 +651,8 @@ void Op_Grad_VEF_P1B_Face::calculer_flux_bords() const
     flux_bords_.resize(domaine_VEF.nb_faces_bord(), dimension);
   flux_bords_ = 0.;
 
-  //int nse=domaine_VEF.domaine().nb_som_elem();
   int nb_faces_bord = domaine_VEF.premiere_face_int();
   int nps = domaine_VEF.numero_premier_sommet();
-  const IntTab& sommets = domaine_VEF.face_sommets();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-  //const IntTab& som_elem=le_dom_vef->domaine().les_elems();
-  const DoubleTab& face_normales = domaine_VEF.face_normales();
   const Navier_Stokes_std& eqn_hydr = ref_cast(Navier_Stokes_std, equation());
   const Champ_P1_isoP1Bulle& la_pression_P1B = ref_cast(Champ_P1_isoP1Bulle, eqn_hydr.pression_pa());
   // Si on filtre:
@@ -682,42 +663,34 @@ void Op_Grad_VEF_P1B_Face::calculer_flux_bords() const
   else
     la_pression_P1B.filtrage(domaine_VEF, la_pression_P1B);
 
-
-
-  const DoubleVect& pression_P1B = la_pression_P1B.champ_filtre();
-
   double coeff_P1 = 1. / dimension;
   bool alphaE = domaine_VEF.get_alphaE();
   bool alphaS = domaine_VEF.get_alphaS();
-  int nb_som_par_face = sommets.dimension(1);
-  CIntTabView face_voisins_v = face_voisins.view_ro();
-  CIntTabView sommets_v = sommets.view_ro();
-  CDoubleTabView face_normales_v = face_normales.view_ro();
-  CDoubleArrView pression_P1B_v = pression_P1B.view_ro();
-  DoubleTabView flux_bords_v = flux_bords_.view_wo();
+  int nb_som_par_face = domaine_VEF.face_sommets().dimension(1);
   int dim = Objet_U::dimension;
-
-  auto kern_flux_bords = KOKKOS_LAMBDA(int
-                                       face)
+  CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro();
+  CIntTabView sommets = domaine_VEF.face_sommets().view_ro();
+  CDoubleTabView face_normales = domaine_VEF.face_normales().view_ro();
+  CDoubleArrView pression_P1B = static_cast<const ArrOfDouble&>(la_pression_P1B.champ_filtre()).view_ro();
+  DoubleTabView flux_bords = flux_bords_.view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_bord, KOKKOS_LAMBDA(int face)
   {
-    int elem = face_voisins_v(face, 0);
+    int elem = face_voisins(face, 0);
     double pres_tot = 0.;
     // Contribution de la pression P0
-    if (alphaE) pres_tot = pression_P1B_v(elem);
+    if (alphaE) pres_tot = pression_P1B(elem);
     // Contribution de la pression P1
     if (alphaS)
       {
         double pres_som = 0.;
         for (int som = 0; som < nb_som_par_face; som++)
-          pres_som += pression_P1B_v(nps + sommets_v(face, som));
+          pres_som += pression_P1B(nps + sommets(face, som));
         pres_tot += coeff_P1 * pres_som;
       }
     // Calcul de la resultante et du couple de pression
     for (int i = 0; i < dim; i++)
-      flux_bords_v(face, i) = pres_tot * face_normales_v(face, i);
-  };
-
-  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_bord, kern_flux_bords);
+      flux_bords(face, i) = pres_tot * face_normales(face, i);
+  });
   end_gpu_timer(__KERNEL_NAME__);
 }
 
diff --git a/src/VEF/Solveurs/Assembleur_P_VEF.h b/src/VEF/Solveurs/Assembleur_P_VEF.h
index 765028aab0..1281984561 100644
--- a/src/VEF/Solveurs/Assembleur_P_VEF.h
+++ b/src/VEF/Solveurs/Assembleur_P_VEF.h
@@ -49,13 +49,15 @@ class Assembleur_P_VEF: public Assembleur_base
   void completer(const Equation_base&) override;
   inline const Equation_base& equation() const;
 
+  protected_but_public_for_cuda
+  void calculer_inv_volume(DoubleTab& inv_volumes_entrelaces, const Domaine_Cl_VEF& domaine_Cl_VEF, const DoubleVect& volumes_entrelaces);
+
 protected:
   OBS_PTR(Equation_base) mon_equation;
   OBS_PTR(Domaine_VEF) le_dom_VEF;
   OBS_PTR(Domaine_Cl_VEF) le_dom_Cl_VEF;
   DoubleTab les_coeff_pression;
   int has_P_ref = 0;
-  void calculer_inv_volume(DoubleTab& inv_volumes_entrelaces, const Domaine_Cl_VEF& domaine_Cl_VEF, const DoubleVect& volumes_entrelaces);
 
 };
 
diff --git a/src/VEF/Solveurs/Assembleur_P_VEFPre1B_tools.cpp b/src/VEF/Solveurs/Assembleur_P_VEFPre1B_tools.cpp
index 2ee16ee829..c0abf8f3d2 100644
--- a/src/VEF/Solveurs/Assembleur_P_VEFPre1B_tools.cpp
+++ b/src/VEF/Solveurs/Assembleur_P_VEFPre1B_tools.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -29,77 +29,86 @@
 #include <Milieu_base.h>
 #include <Scatter.h>
 #include <communications.h>
+#include <Matrix_tools.h>
+#include <Matrice_Bloc.h>
+#include <algorithm>
+#include <numeric>
 
-static int face_associee=-1;
+static double gradi[3]; // file-local array of 3 doubles; presumably a gradient work buffer with one entry per space dimension — TODO confirm
+static double gradj[3]; // second file-local array of 3 doubles, same presumed role as gradi — TODO confirm
 
-static ArrOfDouble gradi(3);
-static ArrOfDouble gradj(3);
-static inline void
-projette(ArrOfDouble& grad, int face, const DoubleTab& normales)
+template<typename VectType, typename TabType> // Removes from grad its component along the normal of face: grad -= (grad.n / n.n) * n
+KOKKOS_INLINE_FUNCTION
+static void projette(VectType& grad, int face, const TabType& normales) // grad is updated in place; normales is indexed as (face, comp)
 {
+  int dimension; // number of components, taken from the second extent of normales
+  if constexpr (Kokkos::is_view<TabType>::value) // Kokkos view: the size is read with extent()
+    dimension = (int)normales.extent(1);
+  else // any other table type: the size is read with dimension()
+    dimension = normales.dimension(1);
   double psc=0, norm=0;
-  int dimension=Objet_U::dimension, comp;
-  for(comp=0; comp<dimension; comp++)
+  for(int comp=0; comp<dimension; comp++) // accumulates psc = grad.n and norm = n.n
     {
       psc+=grad[comp]*normales(face,comp);
       norm+=normales(face,comp)*normales(face,comp);
     }
   // psc/=norm; // Fixed bug: Arithmetic exception
-  if (std::fabs(norm)>=DMINFLOAT) psc/=norm;
-  for(comp=0; comp<dimension; comp++)
-    {
-      grad[comp]-=psc*normales(face,comp);
-    }
-  psc=0;
-  //   for(comp=0; comp<dimension; comp++)
-  //     {
-  //       psc+=gradi(comp)*normales(face,comp);
-  //     }
-  //   assert(psc < 1.e-10);
+  if (Kokkos::fabs(norm)>=DMINFLOAT) psc/=norm; // the division is skipped when norm is below DMINFLOAT
+  for(int comp=0; comp<dimension; comp++) // subtracts the normal part from grad
+    grad[comp]-=psc*normales(face,comp);
 }
-//
 
 // renvoie la premiere face non Dirichlet
 //                            2 si Perio
-//                            3 si Neumann
-//                            4 si Symetrie
-//                            1 sinon
+//                            -3 (NEUMANN) for Neumann_sortie_libre
+//                            -4 (SYMMETRY) for Symetrie
+//                            -2 (DIRICHLET) for Dirichlet / Dirichlet_homogene
 // et face_associee=-1 sauf si perio (face_associee=face associee)
-static inline int okface(int& ind_face, int& face, const Cond_lim& la_cl)
+
+// Face code stored in cl(face) by build_cl(): a value >= 0 is the associated face of a kept Periodique face
+// INTERNAL  : default code, kept by faces that no boundary condition overwrites
+// DIRICHLET : Dirichlet or Dirichlet_homogene
+// NEUMANN   : Neumann_sortie_libre
+// SYMMETRY  : Symetrie
+// OTHER     : Periodique face skipped because its associated face has a smaller index
+enum BOUNDARY { INTERNAL = -1, DIRICHLET = -2, NEUMANN = -3, SYMMETRY = -4, OTHER = -5 };
+
+static void build_cl(ArrOfInt& cl, const Conds_lim& les_cl) // Fills cl with one BOUNDARY code (or an associated periodic face number) per boundary face
 {
-  face_associee=-1;
-  int ok=1;
-  const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
-  int nb_faces_bord_tot = le_bord.nb_faces_tot();
-  do
+  cl = INTERNAL; // default code, kept by every face not visited below
+  for (int i = 0; i < les_cl.size(); i++) // one pass per boundary condition
     {
-      face=le_bord.num_face(ind_face);
-      if ((sub_type(Dirichlet, la_cl.valeur()))
-          || (sub_type(Dirichlet_homogene, la_cl.valeur())))
-        {
-          ok=0;
-        }
-      else if (sub_type(Periodique,la_cl.valeur()))
+      const Cond_lim& la_cl = les_cl[i];
+      const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
+      int nb_faces_bord_tot = le_bord.nb_faces_tot(); // every loop below runs over nb_faces_tot() faces of this boundary
+      if (sub_type(Dirichlet, la_cl.valeur()) || sub_type(Dirichlet_homogene, la_cl.valeur()))
         {
-          //periodicite
-          const Periodique& la_cl_perio = ref_cast(Periodique, la_cl.valeur());
-          face_associee=le_bord.num_face(la_cl_perio.face_associee(ind_face));
-          ok=2;
+          for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
+            cl(le_bord.num_face(ind_face)) = DIRICHLET;
         }
       else if (sub_type(Neumann_sortie_libre, la_cl.valeur()))
         {
-          //sortie_libre
-          ok=3;
+          for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
+            cl(le_bord.num_face(ind_face)) = NEUMANN;
         }
       else if (sub_type(Symetrie, la_cl.valeur()))
         {
-          //symetrie
-          ok=4;
+          for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
+            cl(le_bord.num_face(ind_face)) = SYMMETRY;
+        }
+      else if (sub_type(Periodique, la_cl.valeur()))
+        {
+          const Periodique& la_cl_perio = ref_cast(Periodique, la_cl.valeur());
+          for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
+            {
+              int face = le_bord.num_face(ind_face);
+              int face_associee = le_bord.num_face(la_cl_perio.face_associee(ind_face)); // number of the face paired with this one
+              cl(face) = face_associee >= face ? face_associee : OTHER; // OTHER = Periodique face skipped (face_associee < face)
+            }
         }
+      else // any other boundary condition type is rejected through Process::exit
+        Process::exit("Not coded");
     }
-  while ( ( (ok==0) || ((ok==2)&&(face_associee<face)) ) && (++ind_face<nb_faces_bord_tot) );
-  if (ind_face==nb_faces_bord_tot) ok=-1;
-  return ok;
 }
 
 inline int verifier_complet(const Assembleur_P_VEFPreP1B& ass,
@@ -271,28 +280,28 @@ int verifier( const Assembleur_P_VEFPreP1B& ass,
     }
   return 1;
 }
+KOKKOS_INLINE_FUNCTION void swap (int& i, int& j) // Exchanges the values of i and j
+{
+  int k=i; // keeps the initial value of i while it is overwritten
+  i=j;
+  j=k;
+}
+
 //
 // trie le tableau sommets dans l'ordre croissant et
 // faces_op1 et faces_op2 consequemment.
 //
-static inline void sort( ArrOfInt& sommets, ArrOfInt& faces_op1, ArrOfInt& faces_op2)
+KOKKOS_INLINE_FUNCTION void sort(int* sommets, int* faces_op1, int* faces_op2) // Sorts sommets in increasing order and applies the same permutation to faces_op1 and faces_op2
 {
-  int sz=sommets.size_array();
-  if(sommets[sz-1]==-1) sz--;
-  int i,j;
-  for(i=0; i<sz; i++)
-    for(j=i; j<sz; j++)
+  int sz=5; // NOTE(review): reads up to 5 entries — assumes each array holds at least 5 ints with unused trailing slots set to -1; confirm for 2D callers
+  while (sz>0 && sommets[sz-1]==-1) sz--; // drops every trailing -1 entry from the sorted range
+  for(int i=0; i<sz; i++) // exchange sort: position i ends up holding the smallest remaining value
+    for(int j=i; j<sz; j++)
       if(sommets[i]>sommets[j])
         {
-          int tmp=sommets[i];
-          sommets[i]=sommets[j];
-          sommets[j]=tmp;
-          tmp=faces_op1[i];
-          faces_op1[i]=faces_op1[j];
-          faces_op1[j]=tmp;
-          tmp=faces_op2[i];
-          faces_op2[i]=faces_op2[j];
-          faces_op2[j]=tmp;
+          swap(sommets[i], sommets[j]);
+          swap(faces_op1[i], faces_op1[j]); // the two face arrays follow the permutation of sommets
+          swap(faces_op2[i], faces_op2[j]);
         }
 }
 static inline int chercher_arete(const Domaine_VEF& domaine_VEF,
@@ -324,12 +333,6 @@ static inline int chercher_arete(const Domaine_VEF& domaine_VEF,
     }
   return -1;
 }
-static inline void swap (int& i, int& j)
-{
-  int k=i;
-  i=j;
-  j=k;
-}
 
 //
 // rempli sommets, faces_op1 et faces_op2
@@ -338,93 +341,88 @@ static inline void swap (int& i, int& j)
 // dans elem1. face_op2(i) est ... dans elem2. (si elem2=-1, alors face_op2=-1)
 // les dimension premiers sommets sont ceux de face
 // le dernier est dans elem2
-static inline void remplir_sommets(const Domaine_VEF& domaine_VEF,
-                                   int face, int elem1, int elem2,
-                                   ArrOfInt& sommets,
-                                   ArrOfInt& faces_op1,
-                                   ArrOfInt& faces_op2)
+template<typename TabType, typename VectType> // Fills sommets, faces_op1 and faces_op2 for face and its neighbouring elements elem1 and elem2
+KOKKOS_INLINE_FUNCTION
+static void remplir_sommets(const TabType& elem_som, const TabType& face_som, const TabType& elem_faces, const VectType& renum_som_perio,
+                            int face, int face_associee, int elem1, int elem2,
+                            int* sommets,
+                            int* faces_op1,
+                            int* faces_op2)
 {
-  int dplusun=Objet_U::dimension+1;
-  const IntTab& elem_som = domaine_VEF.domaine().les_elems();
-  const IntTab& face_som = domaine_VEF.face_sommets();
-  const IntTab& elem_faces = domaine_VEF.elem_faces();
-  const Domaine& dom=domaine_VEF.domaine();
-  for(int i=0; i<Objet_U::dimension; i++)
-    sommets[i]=dom.get_renum_som_perio(face_som(face,i));
+  int size; // number of faces per element, read from the second extent of elem_faces
+  if constexpr (Kokkos::is_view<TabType>::value) // Kokkos view: the size is read with extent()
+    size = (int)elem_faces.extent(1);
+  else // any other table type: the size is read with dimension()
+    size = elem_faces.dimension(1);
+  int dim = size - 1; // slots 0..dim-1 hold the face vertices, slot dim the elem1 vertex, slot size the elem2 vertex
+  for(int i=0; i<dim; i++) // vertices of face, after renumbering through renum_som_perio
+    sommets[i]=renum_som_perio(face_som(face,i));
   if(elem1!=-1)
     {
       int ok=0;
-      for(int i=0; i<dplusun; i++)
+      for(int i=0; i<size; i++) // scans the local faces of elem1
         if( (elem_faces(elem1,i)==face) ||
             (elem_faces(elem1,i)==face_associee) )
           {
-            sommets[Objet_U::dimension]=
-              dom.get_renum_som_perio(elem_som(elem1, i));
-            faces_op1[Objet_U::dimension]=face;
-            faces_op2[Objet_U::dimension]=-1;
+            sommets[dim]=renum_som_perio(elem_som(elem1, i)); // vertex of elem1 with the same local index as face (or face_associee)
+            faces_op1[dim]=face;
+            faces_op2[dim]=-1;
             ok=1;
           }
         else
           {
-            int j=dom.get_renum_som_perio(elem_som(elem1, i));
-            for(int k=0; k<Objet_U::dimension; k++)
+            int j=renum_som_perio(elem_som(elem1, i));
+            for(int k=0; k<dim; k++) // records in faces_op1[k] the elem1 face whose local index matches face vertex k
               if(j==sommets[k])
                 faces_op1[k]=elem_faces(elem1, i);
           }
-      if (ok!=1)
-        {
-          Cerr << "The discretization used has a P1 component" << finl;
-          Cerr << "which is not available to deal your mesh." << finl;
-          Cerr << "The mesh with this discretization must contain only ";
-          Cerr << (Objet_U::dimension==2?"triangles":"tetraedras") << "." << finl;
-          Process::exit();
-        }
+      if (ok!=1) Process::Kokkos_exit("The discretization used has a P1 component: the mesh must contain only triangles (2D) or tetrahedra (3D)."); // face was not found among the faces of elem1
     }
   else
-    {
-      Cerr << "pas prevu ... " << finl;
-      Process::exit();
-    }
+    Process::Kokkos_exit("pas prevu"); // elem1 == -1 is not handled
   if(elem2!=-1)
     {
-      //int ok=0;
-      for(int i=0; i<dplusun; i++)
+      for(int i=0; i<size; i++) // same scan for elem2, results go to slot size and to faces_op2
         if( (elem_faces(elem2,i)==face)||
             (elem_faces(elem2,i)==face_associee) )
           {
-            sommets[dplusun]=dom.get_renum_som_perio(elem_som(elem2, i));
-            faces_op2[dplusun]=face;
-            faces_op1[dplusun]=-1;
-            //ok=1;
+            sommets[size]=renum_som_perio(elem_som(elem2, i));
+            faces_op2[size]=face;
+            faces_op1[size]=-1;
           }
         else
           {
-            int j=dom.get_renum_som_perio(elem_som(elem2, i));
-            for(int k=0; k<Objet_U::dimension; k++)
+            int j=renum_som_perio(elem_som(elem2, i));
+            for(int k=0; k<dim; k++)
               if(j==sommets[k])
                 faces_op2[k]=elem_faces(elem2, i);
           }
-      // A cause de mise en commentaire de ok=1 assert(ok==1);
     }
   else
     {
-      sommets[dplusun]=-1;
-      faces_op2[dplusun]=-1;
-      faces_op1[dplusun]=-1;
+      sommets[size]=-1; // no second element: the last slot of the three arrays is marked with -1
+      faces_op2[size]=-1;
+      faces_op1[size]=-1;
     }
 }
 
 // calcule le gradient a la face separant elem1 et elem2
 // de la fonction de forme associee au sommet s
 //
-static void calculer_grad(const IntTab& face_voisins,
+template<typename ConstTabType, typename VectType, typename TabType>
+KOKKOS_INLINE_FUNCTION
+static void calculer_grad(const ConstTabType& face_voisins,
                           int elem1, int elem2,
-                          const ArrOfDouble& coef_som,
+                          const VectType& coef_som,
                           int s, int fop1, int fop2,
-                          const DoubleTab& normales,
-                          ArrOfDouble& grad)
+                          const TabType& normales,
+                          double* grad)
 {
-  int dimension=Objet_U::dimension;
+  int dimension;
+  if constexpr (Kokkos::is_view<TabType>::value)
+    dimension = (int)normales.extent(1);
+  else
+    dimension = normales.dimension(1);
   double signe=1;
   if(fop1!=-1)
     {
@@ -435,7 +433,8 @@ static void calculer_grad(const IntTab& face_voisins,
         grad[comp]=signe*normales(fop1,comp);
     }
   else
-    grad=0;
+    for(int comp=0; comp<dimension; comp++)
+      grad[comp]=0;
   if((elem2!=-1)&&(fop2!=-1))
     {
       signe=1;
@@ -457,19 +456,19 @@ static void calculer_grad_arete(int face,
                                 int fop1, int fop2,
                                 int fop3, int fop4,
                                 const DoubleTab& normales,
-                                ArrOfDouble& grad)
+                                double* grad)
 {
   assert(face_voisins(face,0)==elem1);
   int signe1=1,signe2=1,signe3=1,signe4=1;
-  if((!(fop1==-1) && !(face_voisins(fop1,0)==elem1)))
+  if(fop1 != -1 && face_voisins(fop1, 0) != elem1)
     signe1=-1;
-  if(!(fop3==-1) && !(face_voisins(fop3,0)==elem1))
+  if(fop3 != -1 && face_voisins(fop3, 0) != elem1)
     signe3=-1;
   if(elem2!=-1)
     {
-      if((!(fop2==-1) && !(face_voisins(fop2,0)==elem2)))
+      if(fop2 != -1 && face_voisins(fop2, 0) != elem2)
         signe2=-1;
-      if(!(fop4==-1) && !( face_voisins(fop4,0)==elem2))
+      if(fop4 != -1 && face_voisins(fop4, 0) != elem2)
         signe4=-1;
     }
   if(j<3) // une arete de la face
@@ -506,175 +505,42 @@ static void calculer_grad_arete(int face,
     }
 }
 
-static double dotproduct_array_fois_inverse_quantitee_entrelacee(const ArrOfDouble& grad1,const ArrOfDouble& grad2,const DoubleTab& inverse_quantitee_entrelacee, int face )
+template<typename GradType1, typename GradType2, typename TabType> // TabType: a Kokkos view (extent(1)) or a table exposing dimension(1)
+KOKKOS_INLINE_FUNCTION
+static double dotproduct_array_fois_inverse_quantitee_entrelacee(const GradType1& grad1, const GradType2& grad2, const TabType& inverse_quantitee_entrelacee, int face) // returns sum over i of grad1[i]*grad2[i]*inverse_quantitee_entrelacee(face,i)
 {
+  int size; // number of components summed over: second extent of the weight table
+  if constexpr (Kokkos::is_view<TabType>::value)
+    size = static_cast<int>(inverse_quantitee_entrelacee.extent(1)); // named cast instead of a C-style cast for the explicit narrowing to int
+  else
+    size = inverse_quantitee_entrelacee.dimension(1);
   double dot=0;
-  int size=inverse_quantitee_entrelacee.dimension(1);
   for (int i=0; i<size; i++) dot+=grad1[i]*grad2[i]*inverse_quantitee_entrelacee(face,i);
   return dot;
-  //return  dotproduct_array(gradi,gradj)*inverse_quantitee_entrelacee(face,0);
 }
 
-static void contribuer_matriceP0P1(int elem1, int elem2, const ArrOfInt& sommets,
-                                   IntLists& voisins, DoubleLists& coeffs)
-{
-  int dimension=Objet_U::dimension,
-      dplusdeux=dimension+2;
+inline static void mat_add(Matrice_Morse& m, int i, int j, double v) { m(i,j) += v; } // Matrice_Morse overload: plain += on entry (i,j)
+KOKKOS_INLINE_FUNCTION static void mat_add(const Matrice_Morse_View& m, int i, int j, double v) { m.atomic_add(i, j, v); } // Matrice_Morse_View overload: forwards to m.atomic_add(i, j, v)
 
-  for(int i=0; i<dplusdeux; i++)
-    {
-      int si=sommets[i];
-      if (si<0) break;
-      int rang1=voisins[elem1].rang(si);
-      if(rang1==-1)
-        {
-          voisins[elem1].add(si);
-          coeffs[elem1].add(0);
-        }
-      if (elem2!=-1)
-        {
-          int rang2=voisins[elem2].rang(si);
-          if(rang2==-1)
-            {
-              voisins[elem2].add(si);
-              coeffs[elem2].add(0);
-            }
-        }
-    }
-}
-inline void range(int& i, int& n, int& j, int& m, Matrice_Morse& ARR, Matrice_Morse& ARV, Matrice_Morse& AVR, Matrice_Morse& AVV, double coeff)
-{
-  if(i<n)
-    if(j<m)
-      ARR(i,j)+=coeff;
-    else
-      ARV(i,j-m)+=coeff;
-  else if(j<m)
-    AVR(i-n,j)+=coeff;
-  else
-    AVV(i-n,j-m)+=coeff;
-}
-static void update_matriceP0P1(const Domaine_VEF& domaine_VEF,
-                               const DoubleTab& inverse_quantitee_entrelacee,
-                               int face, int elem1, int elem2,
-                               ArrOfInt& sommets, ArrOfInt& faces_op1,
-                               ArrOfInt& faces_op2,           const ArrOfDouble& coef_som,
-                               Matrice_Morse& ARR, Matrice_Morse& ARV,
-                               Matrice_Morse& AVR, Matrice_Morse& AVV)
+template<typename MatType> // MatType: any matrix type accepted by one of the mat_add() overloads
+KOKKOS_INLINE_FUNCTION
+void range(int i, int n, int j, int m, MatType& ARR, MatType& ARV, MatType& AVR, MatType& AVV, double coeff) // adds coeff at (i,j), dispatched to one of the four blocks according to i<n and j<m
 {
-  const DoubleTab& normales = domaine_VEF.face_normales();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-  assert(elem1==face_voisins(face, 0));
-  assert(elem2==face_voisins(face, 1));
-
-  int dimension=Objet_U::dimension,
-      dplusdeux=dimension+2;
-  double psc;
-  //double coeff_som=1./(dimension)/(dimension+1);
-
-  int nb_elem=domaine_VEF.nb_elem();
-  int nb_som=domaine_VEF.nb_som();
-  for(int i=0; i<dplusdeux; i++)
-    {
-      int si=sommets[i];
-      if (si<0) break;
-      calculer_grad(face_voisins, elem1, elem2, coef_som,si, faces_op1[i],
-                    faces_op2[i], normales, gradi);
-      for(int k=0; k<dimension; k++)
-        gradj[k]=normales(face,k);
-      psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
-                                                             inverse_quantitee_entrelacee,face);
-      range(elem1,nb_elem,si,nb_som,ARR,ARV,AVR,AVV,psc);
-      if (elem2!=-1)
-        range(elem2,nb_elem,si,nb_som,ARR,ARV,AVR,AVV,-psc);
-    }
-}
-
-static void contribuer_matriceP1P1(int elem1, int elem2, const ArrOfInt& sommets, IntLists& voisins, DoubleLists& coeffs)
-
-{
-  int dimension=Objet_U::dimension,
-      dplusdeux=dimension+2;
-
-  // On ne traite pas les sommets -1 qui
-  // sont en fin de tableau sommets:
-  while (sommets[dplusdeux-1]==-1)
-    dplusdeux--;
-
-  for(int i=0; i<dplusdeux; i++)
+  if (i<n) // row index below n: target ARR or ARV, row index used as is
     {
-      int si=sommets[i];
-      for(int j=i+1; j<dplusdeux; j++)
-        {
-          int sj=sommets[j];
-          int rang=voisins[si].rang(sj);
-          if (sj>si)
-            {
-              if(rang==-1)
-                {
-                  voisins[si].add(sj);
-                  coeffs[si].add(0);
-                }
-            }
-        }
+      if (j<m) mat_add(ARR, i, j, coeff);
+      else     mat_add(ARV, i, j-m, coeff); // column index shifted by m
     }
-}
-
-static void update_matriceP1P1(const Domaine_VEF& domaine_VEF,
-                               const DoubleTab& inverse_quantitee_entrelacee,
-                               int face, int elem1, int elem2,
-                               ArrOfInt& sommets, ArrOfInt& faces_op1,
-                               ArrOfInt& faces_op2,           const ArrOfDouble& coef_som,
-                               Matrice_Morse& ARR, Matrice_Morse& ARV,
-                               Matrice_Morse& AVR, Matrice_Morse& AVV)
-{
-  const DoubleTab& normales = domaine_VEF.face_normales();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-
-  int dimension=Objet_U::dimension,
-      dplusdeux=dimension+2;
-  double psc;
-  //double coeff_som=1./(dimension)/(dimension+1);
-  //coeff_som*=coeff_som;
-  int nb_som_tot=domaine_VEF.nb_som();
-  int i,j;
-
-  // On ne traite pas les sommets -1 qui
-  // sont en fin de tableau sommets:
-  while (sommets[dplusdeux-1]==-1)
-    dplusdeux--;
-
-  for(i=0; i<dplusdeux; i++)
+  else // row index >= n: target AVR or AVV, row index shifted by n
     {
-      int si=sommets[i];
-      calculer_grad(face_voisins, elem1, elem2, coef_som,si, faces_op1[i],
-                    faces_op2[i], normales, gradi);
-      if(si<nb_som_tot)
-        ARR(si,si)+=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradi,
-                                                                       inverse_quantitee_entrelacee,face);
-      for(j=i+1; j<dplusdeux; j++)
-        {
-          int sj=sommets[j];
-          calculer_grad(face_voisins, elem1, elem2,coef_som, sj, faces_op1[j],
-                        faces_op2[j], normales, gradj);
-          psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,inverse_quantitee_entrelacee,face);
-          //assert(sj>si);
-          if(si<nb_som_tot)
-            if(sj<nb_som_tot)
-              ARR(si,sj)+=psc;
-            else
-              ARV(si,sj-nb_som_tot)+=psc;
-          else if(sj<nb_som_tot)
-            AVR(si-nb_som_tot,sj)+=psc;
-          else
-            AVV(si-nb_som_tot,sj-nb_som_tot)+=psc;
-        }
+      if (j<m) mat_add(AVR, i-n, j, coeff);
+      else     mat_add(AVV, i-n, j-m, coeff); // both indices shifted
     }
 }
 
 static void contribuer_matricePaPa(const Domaine_VEF& domaine_VEF,
                                    int elem1, int elem2,
-                                   const ArrOfInt& sommets, IntLists& voisins, DoubleLists& coeffs)
+                                   const int* sommets, Stencil& stencil, int& nnz)
 
 {
   const IntTab& elem_aretes=domaine_VEF.domaine().elem_aretes();
@@ -719,13 +585,9 @@ static void contribuer_matricePaPa(const Domaine_VEF& domaine_VEF,
                         {
                           int tmp=arete1;
                           if(arete1>arete2) swap(arete1, arete2);
-                          int rang=voisins[arete1].rang(arete2);
-                          if(rang==-1)
-                            {
-                              voisins[arete1].add(arete2);
-                              coeffs[arete1].add(0);
-
-                            }
+                          stencil(nnz, 0) = arete1;
+                          stencil(nnz, 1) = arete2;
+                          nnz++;
                           arete1=tmp;
                         }
                     }
@@ -738,8 +600,8 @@ static void contribuer_matricePaPa(const Domaine_VEF& domaine_VEF,
 static void update_matricePaPa(const Domaine_VEF& domaine_VEF,
                                const DoubleTab& inverse_quantitee_entrelacee,
                                int face, int elem1, int elem2,
-                               ArrOfInt& sommets, ArrOfInt& faces_op1,
-                               ArrOfInt& faces_op2,
+                               int* sommets, int* faces_op1,
+                               int* faces_op2,
                                Matrice_Morse& ARR, Matrice_Morse& ARV,
                                Matrice_Morse& AVR, Matrice_Morse& AVV)
 {
@@ -749,8 +611,6 @@ static void update_matricePaPa(const Domaine_VEF& domaine_VEF,
   const IntTab& face_voisins=domaine_VEF.face_voisins();
   const DoubleTab& normales = domaine_VEF.face_normales();
   int nb_aretes_tot=domaine_VEF.domaine().nb_aretes();
-  int i, j, k, l;
-  double psc;
   // On ne traite pas les sommets -1 qui
   // sont en fin de tableau sommets:
   //while (sommets(dplusdeux-1)==-1)
@@ -760,10 +620,10 @@ static void update_matricePaPa(const Domaine_VEF& domaine_VEF,
 
   int jmax=5;
   if(elem2==-1) jmax=4;
-  for(i=0; i<3; i++)
+  for(int i=0; i<3; i++)
     {
       int si=sommets[i];
-      for(j=i+1; j<jmax; j++)
+      for(int j=i+1; j<jmax; j++)
         {
           int sj=sommets[j];
           int arete1;
@@ -785,10 +645,10 @@ static void update_matricePaPa(const Domaine_VEF& domaine_VEF,
                 ARR(arete1,arete1)+=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradi,
                                                                                        inverse_quantitee_entrelacee,face);
               int jj=j;
-              for(k=i; k<3; k++)
+              for(int k=i; k<3; k++)
                 {
                   int sk=sommets[k];
-                  for(l=jj+1; l<jmax; l++)
+                  for(int l=jj+1; l<jmax; l++)
                     {
                       int sl=sommets[l];
                       int arete2;
@@ -808,19 +668,11 @@ static void update_matricePaPa(const Domaine_VEF& domaine_VEF,
                                               faces_op1[k], faces_op2[k],
                                               faces_op1[l], faces_op2[l],
                                               normales, gradj);
-                          psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
-                                                                                 inverse_quantitee_entrelacee,face);
+                          double psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
+                                                                                        inverse_quantitee_entrelacee,face);
                           int tmp=arete1;
                           if(arete1>arete2) swap(arete1, arete2);
-                          if(arete1<nb_aretes_tot)
-                            if(arete2<nb_aretes_tot)
-                              ARR(arete1,arete2)+=psc;
-                            else
-                              ARV(arete1,arete2-nb_aretes_tot)+=psc;
-                          else if(arete2<nb_aretes_tot)
-                            AVR(arete1-nb_aretes_tot,arete2)+=psc;
-                          else
-                            AVV(arete1-nb_aretes_tot,arete2-nb_aretes_tot)+=psc;
+                          range(arete1,nb_aretes_tot,arete2,nb_aretes_tot,ARR,ARV,AVR,AVV,psc);
                           arete1=tmp;
                         }
                     }
@@ -830,220 +682,9 @@ static void update_matricePaPa(const Domaine_VEF& domaine_VEF,
         }
     }
 }
-static void
-contribuer_matrice_NeumannP0P1(int elem, const ArrOfInt& sommets, IntLists& voisins, DoubleLists& coeffs)
-
-{
-  int dimension=Objet_U::dimension,
-      dplusun=dimension+1;
-  for(int i=0; i<dplusun; i++)
-    {
-      int si=sommets[i];
-      int rang1=voisins[elem].rang(si);
-      if(rang1==-1)
-        {
-          voisins[elem].add(si);
-          coeffs[elem].add(0);
-        }
-    }
-}
-
-static void
-update_matrice_NeumannP0P1(const Domaine_VEF& domaine_VEF,
-                           const DoubleTab& inverse_quantitee_entrelacee,
-                           int face, int elem,
-                           ArrOfInt& sommets, ArrOfInt& faces_op1,   const ArrOfDouble& coef_som,
-                           Matrice_Morse& ARR, Matrice_Morse& ARV,
-                           Matrice_Morse& AVR, Matrice_Morse& AVV)
-{
-  const DoubleTab& normales = domaine_VEF.face_normales();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-
-  int nb_elem_tot=domaine_VEF.nb_elem();
-  int nb_som_tot=domaine_VEF.nb_som();
-
-  int dimension=Objet_U::dimension,
-      dplusun=dimension+1;
-  double unsurdim=1./Objet_U::dimension;
-  double psc;
-  //  double coeff_som=1./(dimension)/(dplusun);
-
-
-  for(int i=0; i<dplusun; i++)
-    {
-      int si=sommets[i];
-      calculer_grad(face_voisins, elem, -1,  coef_som,si, faces_op1[i],
-                    -1, normales, gradi);
-      if(faces_op1[i]!=face)
-        for (int comp=0; comp<dimension; comp++)
-          gradi[comp]+= normales(face,comp)*unsurdim;
-      for(int k=0; k<dimension; k++)
-        gradj[k]=normales(face,k);
-      psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
-                                                             inverse_quantitee_entrelacee,face);
-      if(elem<nb_elem_tot)
-        if(si<nb_som_tot)
-          ARR(elem,si)+=psc;
-        else
-          ARV(elem,si-nb_som_tot)+=psc;
-      else if(si<nb_som_tot)
-        AVR(elem-nb_elem_tot,si)+=psc;
-      else
-        AVV(elem-nb_elem_tot,si-nb_som_tot)+=psc;
-    }
-}
-
-static void
-contribuer_matrice_NeumannP1P1(int elem, const ArrOfInt& sommets, IntLists& voisins, DoubleLists& coeffs)
-
-{
-  int dimension=Objet_U::dimension,
-      dplusun=dimension+1;
-  for(int i=0; i<dplusun; i++)
-    {
-      int si=sommets[i];
-      for(int j=i+1; j<dplusun; j++)
-        {
-          int sj=sommets[j];
-          int rang=voisins[si].rang(sj);
-          if(rang==-1)
-            {
-              voisins[si].add(sj);
-              coeffs[si].add(0);
-            }
-        }
-    }
-}
-
-
-static void
-update_matrice_NeumannP1P1(const Domaine_VEF& domaine_VEF,
-                           const DoubleTab& inverse_quantitee_entrelacee,
-                           int face, int elem,
-                           ArrOfInt& sommets, ArrOfInt& faces_op1,   const ArrOfDouble& coef_som,
-                           Matrice_Morse& ARR, Matrice_Morse& ARV,
-                           Matrice_Morse& AVR, Matrice_Morse& AVV)
-{
-  const DoubleTab& normales = domaine_VEF.face_normales();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-
-  int dimension=Objet_U::dimension,
-      dplusun=dimension+1;
-  double unsurdim=1./Objet_U::dimension;
-  double psc;
-  //  double coeff_som=1./(dimension)/(dplusun);
-  //coeff_som*=coeff_som;
-
-
-  int nb_som_tot=domaine_VEF.nb_som();
-  int i,j;
-  for(i=0; i<dplusun; i++)
-    {
-      int si=sommets[i];
-      calculer_grad(face_voisins, elem, -1,  coef_som,si, faces_op1[i],
-                    -1, normales, gradi);
-      if(faces_op1[i]!=face)
-        for (int comp=0; comp<dimension; comp++)
-          gradi[comp]+= normales(face,comp)*unsurdim;
-      if(si<nb_som_tot)
-        ARR(si,si)+=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradi,
-                                                                       inverse_quantitee_entrelacee,face);
-      for(j=i+1; j<dplusun; j++)
-        {
-          int sj=sommets[j];
-          calculer_grad(face_voisins, elem, -1,  coef_som,sj, faces_op1[j],
-                        -1, normales, gradj);
-          if(faces_op1[j]!=face)
-            for (int comp=0; comp<dimension; comp++)
-              gradj[comp]+= normales(face,comp)*unsurdim;
-          psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,inverse_quantitee_entrelacee,face);
-          if(si<nb_som_tot)
-            if(sj<nb_som_tot)
-              ARR(si,sj)+=psc;
-            else
-              ARV(si,sj-nb_som_tot)+=psc;
-          else if(sj<nb_som_tot)
-            AVR(si-nb_som_tot,sj)+=psc;
-          else
-            AVV(si-nb_som_tot,sj-nb_som_tot)+=psc;
-        }
-    }
-}
-
-static void
-contribuer_matrice_SymetrieP1P1(int elem, const ArrOfInt& sommets, IntLists& voisins, DoubleLists& coeffs)
-{
-  int dimension=Objet_U::dimension,
-      dplusun=dimension+1;
-  for(int i=0; i<dplusun; i++)
-    {
-      int si=sommets[i];
-      for(int j=i+1; j<dplusun; j++)
-        {
-          int sj=sommets[j];
-          int rang=voisins[si].rang(sj);
-          if(rang==-1)
-            {
-              voisins[si].add(sj);
-              coeffs[si].add(0);
-            }
-        }
-    }
-}
-
 
 static void
-update_matrice_SymetrieP1P1(const Domaine_VEF& domaine_VEF,
-                            const DoubleTab& inverse_quantitee_entrelacee,
-                            int face, int elem,
-                            ArrOfInt& sommets, ArrOfInt& faces_op1,   const ArrOfDouble& coef_som,
-                            Matrice_Morse& ARR, Matrice_Morse& ARV,
-                            Matrice_Morse& AVR, Matrice_Morse& AVV)
-{
-  const DoubleTab& normales = domaine_VEF.face_normales();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-
-  int dimension=Objet_U::dimension,
-      dplusun=dimension+1;
-  double psc;
-  //  double coeff_som=1./(dimension)/(dplusun);
-  //coeff_som*=coeff_som;
-
-
-  int nb_som_tot=domaine_VEF.nb_som();
-  int i,j;
-  for(i=0; i<dplusun; i++)
-    {
-      int si=sommets[i];
-      calculer_grad(face_voisins, elem, -1, coef_som, si, faces_op1[i],
-                    -1, normales, gradi);
-      projette(gradi, face, normales);
-      if(si<nb_som_tot)
-        ARR(si,si)+=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradi,
-                                                                       inverse_quantitee_entrelacee,face);
-      for(j=i+1; j<dplusun; j++)
-        {
-          int sj=sommets[j];
-          calculer_grad(face_voisins, elem, -1,  coef_som,sj, faces_op1[j],
-                        -1, normales, gradj);
-          projette(gradj, face, normales);
-          psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
-                                                                 inverse_quantitee_entrelacee,face);
-          if(si<nb_som_tot)
-            if(sj<nb_som_tot)
-              ARR(si,sj)+=psc;
-            else
-              ARV(si,sj-nb_som_tot)+=psc;
-          else if(sj<nb_som_tot)
-            AVR(si-nb_som_tot,sj)+=psc;
-          else
-            AVV(si-nb_som_tot,sj-nb_som_tot)+=psc;
-        }
-    }
-}
-
-static void
-contribuer_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF, int elem, const ArrOfInt& sommets, IntLists& voisins, DoubleLists& coeffs)
+contribuer_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF, int elem, const int* sommets, Stencil& stencil, int& nnz)
 
 {
   const IntTab& elem_aretes=domaine_VEF.domaine().elem_aretes();
@@ -1075,13 +716,9 @@ contribuer_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF, int elem, const A
                         {
                           int tmp=arete1;
                           if(arete1>arete2) swap(arete1, arete2);
-                          int rang=voisins[arete1].rang(arete2);
-                          if(rang==-1)
-                            {
-                              voisins[arete1].add(arete2);
-                              coeffs[arete1].add(0);
-
-                            }
+                          stencil(nnz, 0) = arete1;
+                          stencil(nnz, 1) = arete2;
+                          nnz++;
                           arete1=tmp;
                         }
                     }
@@ -1096,7 +733,7 @@ static void
 update_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF,
                            const DoubleTab& inverse_quantitee_entrelacee,
                            int face, int elem,
-                           ArrOfInt& sommets, ArrOfInt& faces_op1,
+                           int* sommets, int* faces_op1,
                            Matrice_Morse& ARR, Matrice_Morse& ARV,
                            Matrice_Morse& AVR, Matrice_Morse& AVV)
 {
@@ -1107,14 +744,10 @@ update_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF,
   const DoubleTab& normales = domaine_VEF.face_normales();
   int nb_aretes_tot=domaine_VEF.domaine().nb_aretes();
 
-  int i, j, k, l;
-  double psc;
-
-
-  for(i=0; i<3; i++)
+  for(int i=0; i<3; i++)
     {
       int si=sommets[i];
-      for(j=i+1; j<4; j++)
+      for(int j=i+1; j<4; j++)
         {
           int sj=sommets[j];
           int arete1;
@@ -1131,10 +764,10 @@ update_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF,
                 ARR(arete1,arete1)+=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradi,
                                                                                        inverse_quantitee_entrelacee,face);
               int jj=j;
-              for(k=i; k<3; k++)
+              for(int k=i; k<3; k++)
                 {
                   int sk=sommets[k];
-                  for(l=jj+1; l<4; l++)
+                  for(int l=jj+1; l<4; l++)
                     {
                       int sl=sommets[l];
                       int arete2= chercher_arete(domaine_VEF,elem, sl, sk,
@@ -1148,19 +781,11 @@ update_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF,
                                               faces_op1[k], -1,
                                               faces_op1[l], -1,
                                               normales, gradj);
-                          psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
-                                                                                 inverse_quantitee_entrelacee,face);
+                          double psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
+                                                                                        inverse_quantitee_entrelacee,face);
                           int tmp=arete1;
                           if(arete1>arete2) swap(arete1, arete2);
-                          if(arete1<nb_aretes_tot)
-                            if(arete2<nb_aretes_tot)
-                              ARR(arete1,arete2)+=psc;
-                            else
-                              ARV(arete1,arete2-nb_aretes_tot)+=psc;
-                          else if(arete2<nb_aretes_tot)
-                            AVR(arete1-nb_aretes_tot,arete2)+=psc;
-                          else
-                            AVV(arete1-nb_aretes_tot,arete2-nb_aretes_tot)+=psc;
+                          range(arete1,nb_aretes_tot,arete2,nb_aretes_tot,ARR,ARV,AVR,AVV,psc);
                           arete1=tmp;
                         }
                     }
@@ -1172,7 +797,7 @@ update_matrice_NeumannPaPa(const Domaine_VEF& domaine_VEF,
 }
 
 static void
-contribuer_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF, int elem, const ArrOfInt& sommets, IntLists& voisins, DoubleLists& coeffs)
+contribuer_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF, int elem, const int* sommets, Stencil& stencil, int& nnz)
 {
   const IntTab& elem_aretes=domaine_VEF.domaine().elem_aretes();
   const IntTab& aretes_som=domaine_VEF.domaine().aretes_som();
@@ -1203,13 +828,9 @@ contribuer_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF, int elem, const
                         {
                           int tmp=arete1;
                           if(arete1>arete2) swap(arete1, arete2);
-                          int rang=voisins[arete1].rang(arete2);
-                          if(rang==-1)
-                            {
-                              voisins[arete1].add(arete2);
-                              coeffs[arete1].add(0);
-
-                            }
+                          stencil(nnz, 0) = arete1;
+                          stencil(nnz, 1) = arete2;
+                          nnz++;
                           arete1=tmp;
                         }
                     }
@@ -1224,7 +845,7 @@ static void
 update_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF,
                             const DoubleTab& inverse_quantitee_entrelacee,
                             int face, int elem,
-                            ArrOfInt& sommets, ArrOfInt& faces_op1,
+                            int* sommets, int* faces_op1,
                             Matrice_Morse& ARR, Matrice_Morse& ARV,
                             Matrice_Morse& AVR, Matrice_Morse& AVV)
 {
@@ -1233,16 +854,12 @@ update_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF,
   const ArrOfInt& ok_arete=domaine_VEF.get_ok_arete();
   const IntTab& face_voisins=domaine_VEF.face_voisins();
   const DoubleTab& normales = domaine_VEF.face_normales();
-  int i, j, k, l;
   int nb_aretes_tot=domaine_VEF.domaine().nb_aretes();
 
-  double psc;
-
-
-  for(i=0; i<3; i++)
+  for(int i=0; i<3; i++)
     {
       int si=sommets[i];
-      for(j=i+1; j<4; j++)
+      for(int j=i+1; j<4; j++)
         {
           int sj=sommets[j];
           int arete1;
@@ -1260,10 +877,10 @@ update_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF,
                 ARR(arete1,arete1)+=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradi,
                                                                                        inverse_quantitee_entrelacee,face);
               int jj=j;
-              for(k=i; k<3; k++)
+              for(int k=i; k<3; k++)
                 {
                   int sk=sommets[k];
-                  for(l=jj+1; l<4; l++)
+                  for(int l=jj+1; l<4; l++)
                     {
                       int sl=sommets[l];
                       int arete2= chercher_arete(domaine_VEF,elem, sl, sk,
@@ -1278,19 +895,11 @@ update_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF,
                                               faces_op1[l], -1,
                                               normales, gradj);
                           projette(gradj, face, normales);
-                          psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
-                                                                                 inverse_quantitee_entrelacee,face);
+                          double psc=dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
+                                                                                        inverse_quantitee_entrelacee,face);
                           int tmp=arete1;
                           if(arete1>arete2) swap(arete1, arete2);
-                          if(arete1<nb_aretes_tot)
-                            if(arete2<nb_aretes_tot)
-                              ARR(arete1,arete2)+=psc;
-                            else
-                              ARV(arete1,arete2-nb_aretes_tot)+=psc;
-                          else if(arete2<nb_aretes_tot)
-                            AVR(arete1-nb_aretes_tot,arete2)+=psc;
-                          else
-                            AVV(arete1-nb_aretes_tot,arete2-nb_aretes_tot)+=psc;
+                          range(arete1,nb_aretes_tot,arete2,nb_aretes_tot,ARR,ARV,AVR,AVV,psc);
                           arete1=tmp;
                         }
                     }
@@ -1302,9 +911,8 @@ update_matrice_SymetriePaPa(const Domaine_VEF& domaine_VEF,
 }
 
 static void contribuer_matriceP0Pa(const Domaine_VEF& domaine_VEF, int elem1, int elem2,
-                                   const ArrOfInt& sommets,
-                                   IntLists& voisins,
-                                   DoubleLists& coeffs)
+                                   const int* sommets,
+                                   Stencil& stencil, int& nnz)
 
 {
   const IntTab& elem_aretes=domaine_VEF.domaine().elem_aretes();
@@ -1327,20 +935,14 @@ static void contribuer_matriceP0Pa(const Domaine_VEF& domaine_VEF, int elem1, in
                                     elem_aretes, aretes_som);
           if(ok_arete[arete1])
             {
-              int rang=voisins[elem1].rang(arete1);
-              if(rang==-1)
-                {
-                  voisins[elem1].add(arete1);
-                  coeffs[elem1].add(0);
-                }
+              stencil(nnz, 0) = elem1;
+              stencil(nnz, 1) = arete1;
+              nnz++;
               if(elem2!=-1)
                 {
-                  int rangbis=voisins[elem2].rang(arete1);
-                  if(rangbis==-1)
-                    {
-                      voisins[elem2].add(arete1);
-                      coeffs[elem2].add(0);
-                    }
+                  stencil(nnz, 0) = elem2;
+                  stencil(nnz, 1) = arete1;
+                  nnz++;
                 }
             }
         }
@@ -1350,9 +952,9 @@ static void contribuer_matriceP0Pa(const Domaine_VEF& domaine_VEF, int elem1, in
 static void update_matriceP0Pa(const Domaine_VEF& domaine_VEF,
                                const DoubleTab& inverse_quantitee_entrelacee,
                                int face, int elem1, int elem2,
-                               ArrOfInt& sommets,
-                               ArrOfInt& faces_op1,
-                               ArrOfInt& faces_op2,
+                               int* sommets,
+                               int* faces_op1,
+                               int* faces_op2,
                                Matrice_Morse& ARR, Matrice_Morse& ARV,
                                Matrice_Morse& AVR, Matrice_Morse& AVV)
 {
@@ -1364,16 +966,12 @@ static void update_matriceP0Pa(const Domaine_VEF& domaine_VEF,
   int nb_aretes_tot=domaine_VEF.domaine().nb_aretes();
   int nb_elem_tot=domaine_VEF.domaine().nb_elem();
 
-  int i, j;
-  double psc;
-
-
   int jmax=5;
   if(elem2==-1) jmax=4;
-  for(i=0; i<3; i++)
+  for(int i=0; i<3; i++)
     {
       int si=sommets[i];
-      for(j=i+1; j<jmax; j++)
+      for(int j=i+1; j<jmax; j++)
         {
           int sj=sommets[j];
           int arete1;
@@ -1390,32 +988,16 @@ static void update_matriceP0Pa(const Domaine_VEF& domaine_VEF,
                                   faces_op1[i], faces_op2[i],
                                   faces_op1[j], faces_op2[j],
                                   normales, gradi);
-              psc=0;
+              double psc=0;
               for(int comp=0; comp<3; comp++)
                 psc+=gradi[comp]*normales(face, comp)
                      *(-inverse_quantitee_entrelacee(face,comp));
-              if(elem1<nb_elem_tot)
-                if(arete1<nb_aretes_tot)
-                  ARR(elem1,arete1)+=psc;
-                else
-                  ARV(elem1,arete1-nb_aretes_tot)+=psc;
-              else if(arete1<nb_aretes_tot)
-                AVR(elem1-nb_elem_tot,arete1)+=psc;
-              else
-                AVV(elem1-nb_elem_tot,arete1-nb_aretes_tot)+=psc;
+              range(elem1,nb_elem_tot,arete1,nb_aretes_tot,ARR,ARV,AVR,AVV,psc);
 
               if(elem2!=-1)
                 {
                   psc*=-1.0;
-                  if(elem2<nb_elem_tot)
-                    if(arete1<nb_aretes_tot)
-                      ARR(elem2,arete1)+=psc;
-                    else
-                      ARV(elem2,arete1-nb_aretes_tot)+=psc;
-                  else if(arete1<nb_aretes_tot)
-                    AVR(elem2-nb_elem_tot,arete1)+=psc;
-                  else
-                    AVV(elem2-nb_elem_tot,arete1-nb_aretes_tot)+=psc;
+                  range(elem2,nb_elem_tot,arete1,nb_aretes_tot,ARR,ARV,AVR,AVV,psc);
                 }
             }
         }
@@ -1424,9 +1006,8 @@ static void update_matriceP0Pa(const Domaine_VEF& domaine_VEF,
 
 static void
 contribuer_matrice_NeumannP0Pa(const Domaine_VEF& domaine_VEF, int elem,
-                               const ArrOfInt& sommets,
-                               IntLists& voisins,
-                               DoubleLists& coeffs)
+                               const int* sommets,
+                               Stencil& stencil, int& nnz)
 
 {
   const IntTab& elem_aretes=domaine_VEF.domaine().elem_aretes();
@@ -1443,12 +1024,9 @@ contribuer_matrice_NeumannP0Pa(const Domaine_VEF& domaine_VEF, int elem,
                                   elem_aretes, aretes_som);
           if(ok_arete[arete1])
             {
-              int rang=voisins[elem].rang(arete1);
-              if(rang==-1)
-                {
-                  voisins[elem].add(arete1);
-                  coeffs[elem].add(0);
-                }
+              stencil(nnz, 0) = elem;
+              stencil(nnz, 1) = arete1;
+              nnz++;
             }
         }
     }
@@ -1458,8 +1036,8 @@ static void
 update_matrice_NeumannP0Pa(const Domaine_VEF& domaine_VEF,
                            const DoubleTab& inverse_quantitee_entrelacee,
                            int face, int elem,
-                           ArrOfInt& sommets,
-                           ArrOfInt& faces_op1,
+                           int* sommets,
+                           int* faces_op1,
                            Matrice_Morse& ARR, Matrice_Morse& ARV,
                            Matrice_Morse& AVR, Matrice_Morse& AVV)
 {
@@ -1471,14 +1049,10 @@ update_matrice_NeumannP0Pa(const Domaine_VEF& domaine_VEF,
   int nb_aretes_tot=domaine_VEF.domaine().nb_aretes();
   int nb_elem_tot=domaine_VEF.domaine().nb_elem();
 
-  int i, j;
-  double psc;
-
-
-  for(i=0; i<3; i++)
+  for(int i=0; i<3; i++)
     {
       int si=sommets[i];
-      for(j=i+1; j<4; j++)
+      for(int j=i+1; j<4; j++)
         {
           int sj=sommets[j];
           int arete1;
@@ -1491,28 +1065,19 @@ update_matrice_NeumannP0Pa(const Domaine_VEF& domaine_VEF,
                                   faces_op1[i], -1,
                                   faces_op1[j], -1,
                                   normales, gradi);
-              psc=0;
+              double psc=0;
               for(int comp=0; comp<3; comp++)
                 psc+=gradi[comp]*normales(face, comp)
                      *(-inverse_quantitee_entrelacee(face,comp));
-              if(elem<nb_elem_tot)
-                if(arete1<nb_aretes_tot)
-                  ARR(elem,arete1)+=psc;
-                else
-                  ARV(elem,arete1-nb_aretes_tot)+=psc;
-              else if(arete1<nb_aretes_tot)
-                AVR(elem-nb_elem_tot,arete1)+=psc;
-              else
-                AVV(elem-nb_elem_tot,arete1-nb_aretes_tot)+=psc;
+              range(elem,nb_elem_tot,arete1,nb_aretes_tot,ARR,ARV,AVR,AVV,psc);
             }
         }
     }
 }
 
 static void contribuer_matriceP1Pa(const Domaine_VEF& domaine_VEF, int elem1, int elem2,
-                                   const ArrOfInt& sommets,
-                                   IntLists& voisins,
-                                   DoubleLists& coeffs)
+                                   const int* sommets,
+                                   Stencil& stencil, int& nnz)
 
 {
   const IntTab& elem_aretes=domaine_VEF.domaine().elem_aretes();
@@ -1538,12 +1103,9 @@ static void contribuer_matriceP1Pa(const Domaine_VEF& domaine_VEF, int elem1, in
               for(int k=0; k<jmax; k++)
                 {
                   int sk=sommets[k];
-                  int rang1=voisins[sk].rang(arete1);
-                  if(rang1==-1)
-                    {
-                      voisins[sk].add(arete1);
-                      coeffs[sk].add(0);
-                    }
+                  stencil(nnz, 0) = sk;
+                  stencil(nnz, 1) = arete1;
+                  nnz++;
                 }
             }
         }
@@ -1553,9 +1115,9 @@ static void contribuer_matriceP1Pa(const Domaine_VEF& domaine_VEF, int elem1, in
 static void update_matriceP1Pa(const Domaine_VEF& domaine_VEF,
                                const DoubleTab& inverse_quantitee_entrelacee,
                                int face, int elem1, int elem2,
-                               ArrOfInt& sommets,
-                               ArrOfInt& faces_op1,
-                               ArrOfInt& faces_op2,           const ArrOfDouble& coef_som,
+                               int* sommets,
+                               int* faces_op1,
+                               int* faces_op2,           const ArrOfDouble& coef_som,
                                Matrice_Morse& ARR, Matrice_Morse& ARV,
                                Matrice_Morse& AVR, Matrice_Morse& AVV)
 {
@@ -1570,16 +1132,12 @@ static void update_matriceP1Pa(const Domaine_VEF& domaine_VEF,
   int nb_aretes_tot=domaine_VEF.domaine().nb_aretes();
   int nb_som_tot=domaine_VEF.domaine().nb_som();
 
-  int i, j, k;
-  double psc;
-
-
   int jmax=5;
   if(elem2==-1) jmax=4;
-  for(i=0; i<3; i++)
+  for(int i=0; i<3; i++)
     {
       int si=sommets[i];
-      for(j=i+1; j<jmax; j++)
+      for(int j=i+1; j<jmax; j++)
         {
           int sj=sommets[j];
           int arete1;
@@ -1596,23 +1154,15 @@ static void update_matriceP1Pa(const Domaine_VEF& domaine_VEF,
                                   faces_op1[i], faces_op2[i],
                                   faces_op1[j], faces_op2[j],
                                   normales, gradi);
-              for(k=0; k<jmax; k++)
+              for(int k=0; k<jmax; k++)
                 {
                   int sk=sommets[k];
                   calculer_grad(face_voisins, elem1, elem2, coef_som, sk,
                                 faces_op1[k], faces_op2[k],
                                 normales, gradj);
-                  psc=-dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
-                                                                          inverse_quantitee_entrelacee,face);
-                  if(sk<nb_som_tot)
-                    if(arete1<nb_aretes_tot)
-                      ARR(sk,arete1)+=psc;
-                    else
-                      ARV(sk,arete1-nb_aretes_tot)+=psc;
-                  else if(arete1<nb_aretes_tot)
-                    AVR(sk-nb_som_tot,arete1)+=psc;
-                  else
-                    AVV(sk-nb_som_tot,arete1-nb_aretes_tot)+=psc;
+                  double psc=-dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
+                                                                                 inverse_quantitee_entrelacee,face);
+                  range(sk,nb_som_tot,arete1,nb_aretes_tot,ARR,ARV,AVR,AVV,psc);
                 }
             }
         }
@@ -1621,9 +1171,8 @@ static void update_matriceP1Pa(const Domaine_VEF& domaine_VEF,
 
 static void
 contribuer_matrice_NeumannP1Pa(const Domaine_VEF& domaine_VEF, int elem,
-                               const ArrOfInt& sommets,
-                               IntLists& voisins,
-                               DoubleLists& coeffs)
+                               const int* sommets,
+                               Stencil& stencil, int& nnz)
 
 {
   const IntTab& elem_aretes=domaine_VEF.domaine().elem_aretes();
@@ -1643,13 +1192,9 @@ contribuer_matrice_NeumannP1Pa(const Domaine_VEF& domaine_VEF, int elem,
               for(int k=0; k<4; k++)
                 {
                   int sk=sommets[k];
-                  int rang1=voisins[sk].rang(arete1);
-                  if(rang1==-1)
-                    {
-                      voisins[sk].add(arete1);
-                      coeffs[sk].add(0);
-
-                    }
+                  stencil(nnz, 0) = sk;
+                  stencil(nnz, 1) = arete1;
+                  nnz++;
                 }
             }
         }
@@ -1660,8 +1205,8 @@ static void
 update_matrice_NeumannP1Pa(const Domaine_VEF& domaine_VEF,
                            const DoubleTab& inverse_quantitee_entrelacee,
                            int face, int elem,
-                           ArrOfInt& sommets,
-                           ArrOfInt& faces_op1,   const ArrOfDouble& coef_som,
+                           int* sommets,
+                           int* faces_op1,   const ArrOfDouble& coef_som,
                            Matrice_Morse& ARR, Matrice_Morse& ARV,
                            Matrice_Morse& AVR, Matrice_Morse& AVV)
 {
@@ -1677,14 +1222,10 @@ update_matrice_NeumannP1Pa(const Domaine_VEF& domaine_VEF,
   int nb_aretes_tot=domaine_VEF.domaine().nb_aretes();
   int nb_som_tot=domaine_VEF.domaine().nb_som();
 
-  int i, j, k;
-  double psc;
-
-
-  for(i=0; i<3; i++)
+  for(int i=0; i<3; i++)
     {
       int si=sommets[i];
-      for(j=i+1; j<4; j++)
+      for(int j=i+1; j<4; j++)
         {
           int sj=sommets[j];
           int arete1;
@@ -1697,7 +1238,7 @@ update_matrice_NeumannP1Pa(const Domaine_VEF& domaine_VEF,
                                   faces_op1[i], -1,
                                   faces_op1[j], -1,
                                   normales, gradi);
-              for(k=0; k<4; k++)
+              for(int k=0; k<4; k++)
                 {
                   int sk=sommets[k];
                   calculer_grad(face_voisins, elem, -1,  coef_som,sk,
@@ -1706,17 +1247,9 @@ update_matrice_NeumannP1Pa(const Domaine_VEF& domaine_VEF,
                   if(faces_op1[k]!=face)
                     for (int comp=0; comp<dimension; comp++)
                       gradj[comp]+= normales(face,comp)*unsurdim;
-                  psc=-dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
-                                                                          inverse_quantitee_entrelacee,face);
-                  if(sk<nb_som_tot)
-                    if(arete1<nb_aretes_tot)
-                      ARR(sk,arete1)+=psc;
-                    else
-                      ARV(sk,arete1-nb_aretes_tot)+=psc;
-                  else if(arete1<nb_aretes_tot)
-                    AVR(sk-nb_som_tot,arete1)+=psc;
-                  else
-                    AVV(sk-nb_som_tot,arete1-nb_aretes_tot)+=psc;
+                  double psc=-dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
+                                                                                 inverse_quantitee_entrelacee,face);
+                  range(sk,nb_som_tot,arete1,nb_aretes_tot,ARR,ARV,AVR,AVV,psc);
                 }
             }
         }
@@ -1725,9 +1258,8 @@ update_matrice_NeumannP1Pa(const Domaine_VEF& domaine_VEF,
 
 static void
 contribuer_matrice_SymetrieP1Pa(const Domaine_VEF& domaine_VEF, int elem,
-                                const ArrOfInt& sommets,
-                                IntLists& voisins,
-                                DoubleLists& coeffs)
+                                const int* sommets,
+                                Stencil& stencil, int& nnz)
 
 {
   const IntTab& elem_aretes=domaine_VEF.domaine().elem_aretes();
@@ -1747,12 +1279,9 @@ contribuer_matrice_SymetrieP1Pa(const Domaine_VEF& domaine_VEF, int elem,
               for(int k=0; k<4; k++)
                 {
                   int sk=sommets[k];
-                  int rang1=voisins[sk].rang(arete1);
-                  if(rang1==-1)
-                    {
-                      voisins[sk].add(arete1);
-                      coeffs[sk].add(0);
-                    }
+                  stencil(nnz, 0) = sk;
+                  stencil(nnz, 1) = arete1;
+                  nnz++;
                 }
             }
         }
@@ -1763,8 +1292,8 @@ static void
 update_matrice_SymetrieP1Pa(const Domaine_VEF& domaine_VEF,
                             const DoubleTab& inverse_quantitee_entrelacee,
                             int face, int elem,
-                            ArrOfInt& sommets,
-                            ArrOfInt& faces_op1,   const ArrOfDouble& coef_som,
+                            int* sommets,
+                            int* faces_op1,   const ArrOfDouble& coef_som,
                             Matrice_Morse& ARR, Matrice_Morse& ARV,
                             Matrice_Morse& AVR, Matrice_Morse& AVV)
 {
@@ -1779,14 +1308,10 @@ update_matrice_SymetrieP1Pa(const Domaine_VEF& domaine_VEF,
   int nb_aretes_tot=domaine_VEF.domaine().nb_aretes();
   int nb_som_tot=domaine_VEF.domaine().nb_som();
 
-  int i, j, k;
-  double psc;
-
-
-  for(i=0; i<3; i++)
+  for(int i=0; i<3; i++)
     {
       int si=sommets[i];
-      for(j=i+1; j<4; j++)
+      for(int j=i+1; j<4; j++)
         {
           int sj=sommets[j];
           int arete1;
@@ -1800,24 +1325,16 @@ update_matrice_SymetrieP1Pa(const Domaine_VEF& domaine_VEF,
                                   faces_op1[j], -1,
                                   normales, gradi);
               projette(gradi, face, normales);
-              for(k=0; k<4; k++)
+              for(int k=0; k<4; k++)
                 {
                   int sk=sommets[k];
                   calculer_grad(face_voisins, elem, -1,  coef_som,sk,
                                 faces_op1[k], -1,
                                 normales, gradj);
                   projette(gradj, face, normales);
-                  psc=-dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
-                                                                          inverse_quantitee_entrelacee,face);
-                  if(sk<nb_som_tot)
-                    if(arete1<nb_aretes_tot)
-                      ARR(sk,arete1)+=psc;
-                    else
-                      ARV(sk,arete1-nb_aretes_tot)+=psc;
-                  else if(arete1<nb_aretes_tot)
-                    AVR(sk-nb_som_tot,arete1)+=psc;
-                  else
-                    AVV(sk-nb_som_tot,arete1-nb_aretes_tot)+=psc;
+                  double psc=-dotproduct_array_fois_inverse_quantitee_entrelacee(gradi,gradj,
+                                                                                 inverse_quantitee_entrelacee,face);
+                  range(sk,nb_som_tot,arete1,nb_aretes_tot,ARR,ARV,AVR,AVV,psc);
                 }
             }
         }
@@ -1859,60 +1376,71 @@ void assemblerP1P1(const Domaine_dis_base& z,
   const Domaine& domaine=domaine_VEF.domaine();
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
-  int nb_som = domaine_VEF.domaine().nb_som_tot();
-  int elem1, elem2, face, ok;
-  IntLists voisins(nb_som);
-  DoubleLists coeffs(nb_som);
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
-  // Faces de bord :
-  for(int i=0; i<les_cl.size(); i++)
-    {
-      const Cond_lim& la_cl = les_cl[i];
-      const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for(int ind_face=0; ind_face<nb_faces_bord_tot; ind_face++)
-        {
-          ok=okface(ind_face, face, la_cl);
-          if (ok==-1) break;
-          elem1=face_voisins(face, 0);
-          elem2=face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets,
-                          face_opp1, face_opp2);
-          sort(sommets, face_opp1, face_opp2);
-          if(ok==3)
-            {
-              contribuer_matrice_NeumannP1P1(elem1, sommets, voisins, coeffs);
-            }
-          else if(ok==4)
-            {
-              contribuer_matrice_SymetrieP1P1(elem1, sommets, voisins, coeffs);
-            }
-          else
-            contribuer_matriceP1P1(elem1, elem2, sommets, voisins, coeffs);
-        }
-    }
-  face_associee=-1;
-  for(face=nint; face<nb_faces; face++)
-    {
-      elem1=face_voisins(face, 0);
-      elem2=face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          sort(sommets, face_opp1, face_opp2);
-          contribuer_matriceP1P1(elem1, elem2, sommets, voisins, coeffs);
-        }
-    }
-  DoubleVect diag(nb_som);
-  diag=1;
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+  int nb_som_tot = domaine_VEF.domaine().nb_som_tot();
+
+  ArrOfInt tab_cl(nb_faces_tot);
+  build_cl(tab_cl, les_cl);
+  const int max_nnz_per_line = (dimension + 2) * (dimension + 1) / 2;
+  Stencil tab_stencil;
+  tab_stencil.resize(nb_faces_tot * max_nnz_per_line + nb_som_tot, 2);
+  auto stencil = tab_stencil.view_wo();
+  // Diagonal:
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_som_tot, KOKKOS_LAMBDA(const int s)
+  {
+    stencil(s, 0) = s;
+    stencil(s, 1) = s;
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  using nnz_t = decltype(tab_stencil.dimension(0));
+  TRUSTArray<nnz_t, int> tab_nnz(1);
+  tab_nnz(0) = nb_som_tot;
+  auto nnz = tab_nnz.view_rw();
+  CIntArrView cl = tab_cl.view_ro();
+  CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro();
+  CIntTabView elem_som = domaine_VEF.domaine().les_elems().view_ro();
+  CIntTabView face_som = domaine_VEF.face_sommets().view_ro();
+  CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro();
+  CIntArrView renum_som_perio = domaine_VEF.domaine().get_renum_som_perio().view_ro();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_tot, KOKKOS_LAMBDA(const int face)
+  {
+    if (cl(face)==NEUMANN // Neumann
+        || cl(face)==SYMMETRY // Symetrie
+        || cl(face)>0 // Periodique
+        || cl(face)==INTERNAL) // Face interne
+      {
+        int sommets[5] = { -1,-1,-1,-1,-1};
+        int face_opp1[5]= { -1,-1,-1,-1,-1};
+        int face_opp2[5]= { -1,-1,-1,-1,-1};
+        int elem1=face_voisins(face, 0);
+        int elem2=face_voisins(face, 1);
+        int face_associee = cl(face) < 0 ? -1 : cl(face); // Periodique
+        remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+        sort(sommets, face_opp1, face_opp2);
+        int size = dimension + 2;
+        for (int i = 0; i < size; i++)
+          {
+            int si = sommets[i];
+            if (si<0) break;
+            for (int j = i + 1; j < size; j++)
+              {
+                int sj = sommets[j];
+                if (sj<0) break;
+                if (sj > si || elem2==-1)
+                  {
+                    nnz_t slot = Kokkos::atomic_fetch_add(&nnz(0), 1);
+                    stencil(slot, 0) = si;
+                    stencil(slot, 1) = sj;
+                  }
+              }
+          }
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  tab_stencil.resize(tab_nnz(0), 2);
   matrice.typer("Matrice_Bloc");
   Matrice_Bloc& matrice_bloc=ref_cast(Matrice_Bloc, matrice.valeur());
-  matrice_bloc.remplir(voisins, coeffs, diag, domaine.nb_som(), domaine.nb_som_tot());
+  matrice_bloc.remplir(tab_stencil, domaine.nb_som(), domaine.nb_som_tot());
   Cerr << "Assemblage P1 OK" << finl;
 }
 
@@ -1920,66 +1448,92 @@ void assemblerP1P1(const Domaine_dis_base& z,
 void updateP1P1(const Domaine_dis_base& z,
                 const Domaine_Cl_dis_base& zcl,
                 Matrice& matrice,
-                const DoubleTab& inverse_quantitee_entrelacee, const ArrOfDouble& coef_som)
+                const DoubleTab& tab_inverse_quantitee_entrelacee, const ArrOfDouble& tab_coef_som)
 {
   int dimension=Objet_U::dimension;
   const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z);
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
-  int elem1, elem2, face, ok;
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+  // Tag every face with a boundary-condition code, then add each face's contribution to the blocks of the matrix in one Kokkos loop over all faces.
+  ArrOfInt tab_cl(nb_faces_tot); // one code per face, filled by build_cl() on the next line
+  build_cl(tab_cl, les_cl);
+  int nb_som=domaine_VEF.nb_som(); // bound used for the row/column tests and for the final diagonal pass
   Matrice_Bloc& A=ref_cast(Matrice_Bloc, matrice.valeur());
-  Matrice_Morse_Sym& ARR=ref_cast(Matrice_Morse_Sym, A.get_bloc(0,0).valeur());
-  Matrice_Morse& ARV=ref_cast(Matrice_Morse, A.get_bloc(0,1).valeur());
-  Matrice_Morse& AVR=ref_cast(Matrice_Morse, A.get_bloc(1,0).valeur());
-  Matrice_Morse& AVV=ref_cast(Matrice_Morse, A.get_bloc(1,1).valeur());
-  // Faces de bord :
-  for (auto &itr : les_cl)
-    {
-      const Cond_lim& la_cl = itr;
-      const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
-        {
-          ok = okface(ind_face, face, la_cl);
-          if (ok == -1)
-            break;
-          elem1 = face_voisins(face, 0);
-          elem2 = face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          sort(sommets, face_opp1, face_opp2);
-          if (ok == 3)
-            update_matrice_NeumannP1P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV);
-          else if (ok == 4)
-            update_matrice_SymetrieP1P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV);
-          else
-            update_matriceP1P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV);
-        }
-    }
-  face_associee=-1;
-  for (face = nint; face < nb_faces; face++)
-    {
-      elem1 = face_voisins(face, 0);
-      elem2 = face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          sort(sommets, face_opp1, face_opp2);
-          update_matriceP1P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV);
-        }
-    }
-  int nb_som=domaine_VEF.domaine().nb_som();
-  for(int i=0; i<nb_som; i++)
-    if(ARR(i,i)==0)
+  Matrice_Morse_View ARR, ARV, AVR, AVV; // views on blocks (0,0), (0,1), (1,0), (1,1) of A, bound just below
+  ARR.set(ref_cast(Matrice_Morse_Sym, A.get_bloc(0,0).valeur()));
+  ARV.set(ref_cast(Matrice_Morse, A.get_bloc(0,1).valeur()));
+  AVR.set(ref_cast(Matrice_Morse, A.get_bloc(1,0).valeur()));
+  AVV.set(ref_cast(Matrice_Morse, A.get_bloc(1,1).valeur()));
+  CIntArrView cl = tab_cl.view_ro(); // read-only views read inside the parallel loops
+  CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro();
+  CIntTabView elem_som = domaine_VEF.domaine().les_elems().view_ro();
+  CIntTabView face_som = domaine_VEF.face_sommets().view_ro();
+  CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro();
+  CIntArrView renum_som_perio = domaine_VEF.domaine().get_renum_som_perio().view_ro();
+  CDoubleTabView normales = domaine_VEF.face_normales().view_ro();
+  CDoubleArrView coef_som = tab_coef_som.view_ro();
+  CDoubleTabView inverse_quantitee_entrelacee = tab_inverse_quantitee_entrelacee.view_ro();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces_tot), KOKKOS_LAMBDA(const int face)
+  {
+    int cl_face = cl(face);
+    if (cl_face == DIRICHLET || cl_face == OTHER) return; // faces with these codes add nothing to the matrix
+    // Per-face scratch arrays; every slot starts at -1 and the loops below stop at the first negative vertex.
+    int sommets[5] = { -1,-1,-1,-1,-1};
+    int face_opp1[5]= { -1,-1,-1,-1,-1};
+    int face_opp2[5]= { -1,-1,-1,-1,-1};
+    int elem1 = face_voisins(face, 0);
+    int elem2 = face_voisins(face, 1);
+    int face_associee = cl_face < 0 ? -1 : cl_face; // a non-negative code is forwarded as face_associee (presumably the periodic partner face - TODO confirm in build_cl)
+    remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+    sort(sommets, face_opp1, face_opp2);
+    bool is_symetrie = (cl_face == SYMMETRY);
+
+    int size = dimension+2;
+    double unsurdim = 1./dimension; // weight applied to the face normal in the Neumann branches below
+    for (int i = 0; i < size; i++)
       {
-        //Cerr << "On modifie la ligne (sommet) orpheline " << i << finl;
-        ARR(i,i)=1.;
+        int si = sommets[i];
+        if (si < 0) break; // negative entry: no more vertices for this face
+        double gradi_[3];
+        calculer_grad(face_voisins, elem1, elem2, coef_som, si, face_opp1[i], face_opp2[i], normales, gradi_);
+        if (elem2 == -1) // no second neighbouring element (presumably a boundary face - confirm)
+          {
+            if (is_symetrie)
+              projette(gradi_, face, normales); // Symmetry
+            else if (face_opp1[i] != face)
+              for (int comp = 0; comp < dimension; comp++)
+                gradi_[comp] += normales(face, comp) * unsurdim; // Neumann
+          }
+        if (si < nb_som) // diagonal entry (si, si), accumulated atomically
+          ARR.atomic_add(si, si, dotproduct_array_fois_inverse_quantitee_entrelacee(gradi_, gradi_, inverse_quantitee_entrelacee, face));
+        for (int j = i+1; j < size; j++)
+          {
+            int sj = sommets[j];
+            if (sj < 0) break; // negative entry: no more vertices for this face
+            double gradj_[3];
+            calculer_grad(face_voisins, elem1, elem2, coef_som, sj, face_opp1[j], face_opp2[j], normales, gradj_);
+            if (elem2 == -1) // same boundary treatment as applied to gradi_ above
+              {
+                if (is_symetrie)
+                  projette(gradj_, face, normales); // Symmetry
+                else if (face_opp1[j] != face)
+                  for (int comp = 0; comp < dimension; comp++)
+                    gradj_[comp] += normales(face, comp) * unsurdim; // Neumann
+              }
+            double psc = dotproduct_array_fois_inverse_quantitee_entrelacee(gradi_, gradj_, inverse_quantitee_entrelacee, face);
+            range(si, nb_som, sj, nb_som, ARR, ARV, AVR, AVV, psc); // off-diagonal term; presumably range() picks the block by comparing si and sj with nb_som - confirm
+          }
       }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_som), KOKKOS_LAMBDA(const int i)
+  {
+    // Orphan row (vertex) i: a zero diagonal entry of ARR is replaced by 1.
+    if (ARR(i, i) == 0.) ARR.store(i, i, 1.);
+  });
+  end_gpu_timer(__KERNEL_NAME__);
   Cerr << "Update P1 OK" << finl;
 }
 
@@ -1989,44 +1543,35 @@ void modifieP1P1neumann(const Domaine_dis_base& z,
                         Matrice& matrice,
                         const DoubleTab& inverse_quantitee_entrelacee, const ArrOfDouble& coef_som)
 {
-
-  //int dimension=Objet_U::dimension;
   const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z);
-  //  const Domaine& domaine=domaine_VEF.domaine();
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
-
-  //  int nnz=nb_som;
-  //ArrOfInt sommets(dimension+2);
-  //ArrOfInt face_opp1(dimension+2);
-  //ArrOfInt face_opp2(dimension+2);
   Matrice_Bloc& A=ref_cast(Matrice_Bloc, matrice.valeur());
-  Matrice_Morse_Sym& ARR=ref_cast(Matrice_Morse_Sym, A.get_bloc(0,0).valeur());
-  // Faces de bord :
   assert(ref_cast(Domaine_VEF, z).get_cl_pression_sommet_faible()==0);
   int nb_som_tot=z.nb_som();
+  Matrice_Morse_View ARR;
+  ARR.set(ref_cast(Matrice_Morse_Sym, A.get_bloc(0,0).valeur()));
+  CIntTabView face_som = domaine_VEF.face_sommets().view_ro();
+  int nbsf = domaine_VEF.face_sommets().dimension(1);
   for(auto& itr : les_cl)
     {
-
       const Cond_lim_base& la_cl = itr.valeur();
       if (sub_type(Neumann_sortie_libre,la_cl))
         {
           const Front_VF& le_bord = ref_cast(Front_VF,la_cl.frontiere_dis());
           int nb_faces_bord = le_bord.nb_faces_tot();
-          const IntTab& faces=domaine_VEF.face_sommets();
-          int nbsf=faces.dimension(1);
-          for(int ind_face=0; ind_face<nb_faces_bord; ind_face++)
-            {
-              int face=le_bord.num_face(ind_face);
-              for(int som=0; som<nbsf; som++)
-                {
-
-                  int som_glob=faces(face,som);
-                  if (som_glob<nb_som_tot)
-                    ARR(som_glob,som_glob)=1e12;
-                  //        Cout<<ref_cast(Domaine_VEF, z).numero_premier_sommet()<<" ici "<<som_glob<<finl;
-                }
-            }
+          CIntArrView num_faces = le_bord.num_face().view_ro();
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces_bord), KOKKOS_LAMBDA(const int ind_face)
+          {
+            int face = num_faces(ind_face);
+            for(int som=0; som<nbsf; som++)
+              {
+                int som_glob = face_som(face, som);
+                if (som_glob < nb_som_tot)
+                  ARR.atomic_store(som_glob, som_glob, 1e12); // CL pression forte
+              }
+          });
+          end_gpu_timer(__KERNEL_NAME__);
         }
     }
   Cerr << "Modifie P1P1 Neumann OK" << finl;
@@ -2037,64 +1582,50 @@ void assemblerPaPa(const Domaine_dis_base& z,
                    Matrice& matrice,
                    const DoubleTab& inverse_quantitee_entrelacee)
 {
-  int dimension=Objet_U::dimension;
   const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z);
   const Domaine& domaine=domaine_VEF.domaine();
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
   const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
-  int nb_arete = domaine.nb_aretes_tot();
-  int elem1, elem2, face, ok;
-  IntLists voisins(nb_arete);
-  DoubleLists coeffs(nb_arete);
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
-  // Faces de bord :
-  for(int i=0; i<les_cl.size(); i++)
-    {
-      const Cond_lim& la_cl = les_cl[i];
-      const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for(int ind_face=0; ind_face<nb_faces_bord_tot; ind_face++)
-        {
-          ok=okface(ind_face, face, la_cl);
-          if (ok==-1) break;
-          elem1=face_voisins(face, 0);
-          elem2=face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets,
-                          face_opp1, face_opp2);
-          if(ok==3)
-            {
-              contribuer_matrice_NeumannPaPa(domaine_VEF, elem1, sommets, voisins, coeffs);
-            }
-          else if(ok==4)
-            {
-              contribuer_matrice_SymetriePaPa(domaine_VEF, elem1, sommets, voisins, coeffs);
-            }
-          else
-            contribuer_matricePaPa(domaine_VEF, elem1, elem2, sommets, voisins, coeffs);
-        }
-    }
-  face_associee=-1;
-  for(face=nint; face<nb_faces; face++)
-    {
-      elem1=face_voisins(face, 0);
-      elem2=face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          contribuer_matricePaPa(domaine_VEF, elem1, elem2, sommets, voisins, coeffs);
-        }
+  const IntTab& elem_som = domaine.les_elems();
+  const IntTab& face_som = domaine_VEF.face_sommets();
+  const IntTab& elem_faces = domaine_VEF.elem_faces();
+  const ArrOfInt& renum_som_perio = domaine.get_renum_som_perio();
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+  ArrOfInt cl(nb_faces_tot);
+  build_cl(cl, les_cl);
+  int nb_aretes_tot = domaine.nb_aretes_tot();
+  int sommets[5] = { -1,-1,-1,-1,-1};
+  int face_opp1[5]= { -1,-1,-1,-1,-1};
+  int face_opp2[5]= { -1,-1,-1,-1,-1};
+  ToDo_Kokkos("assemblerPaPa: parallelise this loop over faces");
+  Stencil stencil;
+  stencil.resize(nb_aretes_tot + nb_faces_tot * 54, 2);
+  int nnz = 0;
+  for (int a = 0; a < nb_aretes_tot; a++)
+    {
+      stencil(nnz, 0) = a;
+      stencil(nnz, 1) = a;
+      nnz++;
+    }
+  for (int face = 0; face < nb_faces_tot; face++)
+    {
+      if (cl(face) == DIRICHLET || cl(face) == OTHER) continue;
+      int elem1 = face_voisins(face, 0);
+      int elem2 = face_voisins(face, 1);
+      int face_associee = cl(face) < 0 ? -1 : cl(face);
+      remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+      if (cl(face) == NEUMANN)
+        contribuer_matrice_NeumannPaPa(domaine_VEF, elem1, sommets, stencil, nnz);
+      else if (cl(face) == SYMMETRY)
+        contribuer_matrice_SymetriePaPa(domaine_VEF, elem1, sommets, stencil, nnz);
+      else
+        contribuer_matricePaPa(domaine_VEF, elem1, elem2, sommets, stencil, nnz);
     }
-
-  DoubleVect diag(nb_arete);
-  diag = 1;
+  stencil.resize(nnz, 2);
   matrice.typer("Matrice_Bloc");
   Matrice_Bloc& matrice_bloc=ref_cast(Matrice_Bloc, matrice.valeur());
-  matrice_bloc.remplir(voisins, coeffs, diag, domaine.nb_aretes(), domaine.nb_aretes_tot());
+  matrice_bloc.remplir(stencil, domaine.nb_aretes(), domaine.nb_aretes_tot());
   Cerr << "Assemblage Pa OK" << finl;
 }
 
@@ -2103,56 +1634,40 @@ void updatePaPa(const Domaine_dis_base& z,
                 Matrice& matrice,
                 const DoubleTab& inverse_quantitee_entrelacee)
 {
-  int dimension=Objet_U::dimension;
   const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z);
   const Domaine& domaine=domaine_VEF.domaine();
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
   const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
+  const IntTab& elem_som = domaine.les_elems(); // connectivity tables are fetched once here and passed to remplir_sommets as plain arrays
+  const IntTab& face_som = domaine_VEF.face_sommets();
+  const IntTab& elem_faces = domaine_VEF.elem_faces();
+  const ArrOfInt& renum_som_perio = domaine.get_renum_som_perio();
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+  ArrOfInt cl(nb_faces_tot); // one boundary-condition code per face index in [0, nb_faces_tot)
+  build_cl(cl, les_cl); // fills cl from the boundary conditions (build_cl is defined outside this hunk)
   int nb_arete = domaine.nb_aretes();
-  int elem1, elem2, face, ok;
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
+  int sommets[5] = { -1,-1,-1,-1,-1}; // fixed-size scratch replacing ArrOfInt(dimension+2); assumes dimension <= 3 - TODO confirm
+  int face_opp1[5]= { -1,-1,-1,-1,-1};
+  int face_opp2[5]= { -1,-1,-1,-1,-1};
   Matrice_Bloc& A=ref_cast(Matrice_Bloc, matrice.valeur());
   Matrice_Morse_Sym& ARR=ref_cast(Matrice_Morse_Sym, A.get_bloc(0,0).valeur());
   Matrice_Morse& ARV=ref_cast(Matrice_Morse, A.get_bloc(0,1).valeur());
   Matrice_Morse& AVR=ref_cast(Matrice_Morse, A.get_bloc(1,0).valeur());
   Matrice_Morse& AVV=ref_cast(Matrice_Morse, A.get_bloc(1,1).valeur());
-  // Faces de bord :
-  for (auto &itr : les_cl)
-    {
-      const Cond_lim& la_cl = itr;
-      const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
-        {
-          ok = okface(ind_face, face, la_cl);
-          if (ok == -1)
-            break;
-          elem1 = face_voisins(face, 0);
-          elem2 = face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          if (ok == 3)
-            update_matrice_NeumannPaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV);
-          else if (ok == 4)
-            update_matrice_SymetriePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV);
-          else
-            update_matricePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV);
-        }
-    }
-  face_associee=-1;
-  for (face = nint; face < nb_faces; face++)
-    {
-      elem1 = face_voisins(face, 0);
-      elem2 = face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          update_matricePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV);
-        }
+  for (int face = 0; face < nb_faces_tot; face++) // one pass over every face replaces the former boundary loop followed by the internal-face loop
+    {
+      if (cl(face) == DIRICHLET || cl(face) == OTHER) continue; // no update is made for these two face kinds
+      int elem1 = face_voisins(face, 0);
+      int elem2 = face_voisins(face, 1);
+      int face_associee = cl(face) < 0 ? -1 : cl(face); // a non-negative code is forwarded as the associated face index (periodic pairing, presumably); negative codes give -1
+      remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+      if (cl(face) == NEUMANN)
+        update_matrice_NeumannPaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV);
+      else if (cl(face) == SYMMETRY)
+        update_matrice_SymetriePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV);
+      else // every remaining code (internal face or non-negative code) takes the generic two-element update
+        update_matricePaPa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV);
     }
   for(int i=0; i<nb_arete; i++)
     if(ARR(i,i)==0)
@@ -2170,62 +1685,41 @@ void assemblerP0Pa(const Domaine_dis_base& z,
                    Matrice& matrice,
                    const DoubleTab& inverse_quantitee_entrelacee)
 {
-  int dimension=Objet_U::dimension;
   const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z);
   const Domaine& domaine=domaine_VEF.domaine();
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
   const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
-  int nb_elem = domaine.nb_elem_tot();
-  int elem1, elem2, face, ok;
-  IntLists voisins(nb_elem);
-  DoubleLists coeffs(nb_elem);
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
-  // Faces de bord :
-  for(int i=0; i<les_cl.size(); i++)
-    {
-      const Cond_lim& la_cl = les_cl[i];
-      const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for(int ind_face=0; ind_face<nb_faces_bord_tot; ind_face++)
-        {
-          ok=okface(ind_face, face, la_cl);
-          if (ok==-1) break;
-          elem1=face_voisins(face, 0);
-          elem2=face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets,
-                          face_opp1, face_opp2);
-          if(ok==3)
-            {
-              contribuer_matrice_NeumannP0Pa(domaine_VEF, elem1, sommets, voisins, coeffs);
-            }
-          else if(ok==4)
-            {
-              ; // RIEN
-            }
-          else
-            contribuer_matriceP0Pa(domaine_VEF, elem1, elem2, sommets, voisins, coeffs);
-        }
-    }
-  face_associee=-1;
-  for(face=nint; face<nb_faces; face++)
-    {
-      elem1=face_voisins(face, 0);
-      elem2=face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets,
-                          face_opp1, face_opp2);
-          contribuer_matriceP0Pa(domaine_VEF, elem1, elem2, sommets, voisins, coeffs);
-        }
+  const IntTab& elem_som = domaine.les_elems(); // connectivity tables are fetched once here and passed to remplir_sommets as plain arrays
+  const IntTab& face_som = domaine_VEF.face_sommets();
+  const IntTab& elem_faces = domaine_VEF.elem_faces();
+  const ArrOfInt& renum_som_perio = domaine.get_renum_som_perio();
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+  ArrOfInt cl(nb_faces_tot); // one boundary-condition code per face index in [0, nb_faces_tot)
+  build_cl(cl, les_cl); // fills cl from the boundary conditions (build_cl is defined outside this hunk)
+  int sommets[5] = { -1,-1,-1,-1,-1}; // fixed-size scratch replacing ArrOfInt(dimension+2); assumes dimension <= 3 - TODO confirm
+  int face_opp1[5]= { -1,-1,-1,-1,-1};
+  int face_opp2[5]= { -1,-1,-1,-1,-1};
+  ToDo_Kokkos("assemblerP0Pa: parallelise this loop over faces");
+  Stencil stencil; // replaces the former per-row voisins/coeffs lists
+  stencil.resize(nb_faces_tot * 20, 2); // capacity guess of 20 index pairs per face; NOTE(review): the product is evaluated in int - confirm nb_faces_tot*20 stays below INT_MAX
+  int nnz = 0; // number of stencil rows written so far; presumably advanced by the contribuer_* helpers that receive it with the stencil
+  for (int face = 0; face < nb_faces_tot; face++) // one pass over every face replaces the former boundary loop followed by the internal-face loop
+    {
+      if (cl(face) == DIRICHLET || cl(face) == SYMMETRY || cl(face) == OTHER) continue; // Dirichlet, symmetry (no contribution), reverse periodic
+      int elem1 = face_voisins(face, 0);
+      int elem2 = face_voisins(face, 1);
+      int face_associee = cl(face) < 0 ? -1 : cl(face); // a non-negative code is forwarded as the associated face index (periodic pairing, presumably); negative codes give -1
+      remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+      if (cl(face) == NEUMANN)
+        contribuer_matrice_NeumannP0Pa(domaine_VEF, elem1, sommets, stencil, nnz);
+      else // every remaining code takes the generic two-element contribution
+        contribuer_matriceP0Pa(domaine_VEF, elem1, elem2, sommets, stencil, nnz);
     }
+  stencil.resize(nnz, 2); // shrink from the capacity guess to the rows counted in nnz
   matrice.typer("Matrice_Bloc");
   Matrice_Bloc& matrice_bloc=ref_cast(Matrice_Bloc, matrice.valeur());
-  matrice_bloc.remplir(voisins, coeffs, domaine.nb_elem(), domaine.nb_elem_tot(), domaine.nb_aretes(), domaine.nb_aretes_tot());
+  matrice_bloc.remplir(stencil, domaine.nb_elem(), domaine.nb_elem_tot(), domaine.nb_aretes(), domaine.nb_aretes_tot()); // the 2-column stencil is the only pattern input now; the former coeffs argument is gone
   Cerr << "Assemblage P0Pa OK" << finl;
 }
 
@@ -2234,53 +1728,36 @@ void updateP0Pa(const Domaine_dis_base& z,
                 Matrice& matrice,
                 const DoubleTab& inverse_quantitee_entrelacee)
 {
-  int dimension=Objet_U::dimension;
   const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z);
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
   const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
-  int elem1, elem2, face, ok;
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
+  const IntTab& elem_som = domaine_VEF.domaine().les_elems(); // connectivity tables are fetched once here and passed to remplir_sommets as plain arrays
+  const IntTab& face_som = domaine_VEF.face_sommets();
+  const IntTab& elem_faces = domaine_VEF.elem_faces();
+  const ArrOfInt& renum_som_perio = domaine_VEF.domaine().get_renum_som_perio();
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+  ArrOfInt cl(nb_faces_tot); // one boundary-condition code per face index in [0, nb_faces_tot)
+  build_cl(cl, les_cl); // fills cl from the boundary conditions (build_cl is defined outside this hunk)
+  int sommets[5] = { -1,-1,-1,-1,-1}; // fixed-size scratch replacing ArrOfInt(dimension+2); assumes dimension <= 3 - TODO confirm
+  int face_opp1[5]= { -1,-1,-1,-1,-1};
+  int face_opp2[5]= { -1,-1,-1,-1,-1};
   Matrice_Bloc& A=ref_cast(Matrice_Bloc, matrice.valeur());
   Matrice_Morse& ARR=ref_cast(Matrice_Morse, A.get_bloc(0,0).valeur());
   Matrice_Morse& ARV=ref_cast(Matrice_Morse, A.get_bloc(0,1).valeur());
   Matrice_Morse& AVR=ref_cast(Matrice_Morse, A.get_bloc(1,0).valeur());
   Matrice_Morse& AVV=ref_cast(Matrice_Morse, A.get_bloc(1,1).valeur());
-  // Faces de bord :
-  for (auto &itr : les_cl)
-    {
-      const Cond_lim& la_cl = itr;
-      const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
-        {
-          ok = okface(ind_face, face, la_cl);
-          if (ok == -1)
-            break;
-          elem1 = face_voisins(face, 0);
-          elem2 = face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          if (ok == 3)
-            update_matrice_NeumannP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV);
-          else if (ok == 4) { /* Do nothing */ }
-          else
-            update_matriceP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV);
-        }
-    }
-  face_associee = -1;
-  for (face = nint; face < nb_faces; face++)
-    {
-      elem1 = face_voisins(face, 0);
-      elem2 = face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          update_matriceP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV);
-        }
+  for (int face = 0; face < nb_faces_tot; face++) // one pass over every face replaces the former boundary loop followed by the internal-face loop
+    {
+      if (cl(face) == DIRICHLET || cl(face) == SYMMETRY || cl(face) == OTHER) continue; // Dirichlet, symmetry (no contribution), reverse periodic
+      int elem1 = face_voisins(face, 0);
+      int elem2 = face_voisins(face, 1);
+      int face_associee = cl(face) < 0 ? -1 : cl(face); // a non-negative code is forwarded as the associated face index (periodic pairing, presumably); negative codes give -1
+      remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+      if (cl(face) == NEUMANN)
+        update_matrice_NeumannP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, ARR, ARV, AVR, AVV);
+      else // every remaining code takes the generic two-element update
+        update_matriceP0Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, ARR, ARV, AVR, AVV);
     }
   Cerr << "Update P0Pa OK" << finl;
 }
@@ -2290,63 +1767,43 @@ void assemblerP1Pa(const Domaine_dis_base& z,
                    Matrice& matrice,
                    const DoubleTab& inverse_quantitee_entrelacee, const ArrOfDouble& coef_som)
 {
-  int dimension=Objet_U::dimension;
   const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z);
   const Domaine& domaine=domaine_VEF.domaine();
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
   const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
-  int elem1, elem2, face, ok;
-  int nb_som = domaine.nb_som_tot();
-  IntLists voisins(nb_som);
-  DoubleLists coeffs(nb_som);
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
-  // Faces de bord :
-  for(int i=0; i<les_cl.size(); i++)
-    {
-      const Cond_lim& la_cl = les_cl[i];
-      const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for(int ind_face=0; ind_face<nb_faces_bord_tot; ind_face++)
-        {
-          ok=okface(ind_face, face, la_cl);
-          if (ok==-1) break;
-          elem1=face_voisins(face, 0);
-          elem2=face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets,
-                          face_opp1, face_opp2);
-          if(ok==3)
-            {
-              contribuer_matrice_NeumannP1Pa(domaine_VEF, elem1, sommets, voisins, coeffs);
-            }
-          else if(ok==4)
-            {
-              contribuer_matrice_SymetrieP1Pa(domaine_VEF, elem1, sommets, voisins, coeffs);
-            }
-          else
-            contribuer_matriceP1Pa(domaine_VEF, elem1, elem2, sommets, voisins, coeffs);
-        }
-    }
-  face_associee=-1;
-  for(face=nint; face<nb_faces; face++)
-    {
-      elem1=face_voisins(face, 0);
-      elem2=face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets,
-                          face_opp1, face_opp2);
-          contribuer_matriceP1Pa(domaine_VEF, elem1, elem2, sommets, voisins, coeffs);
-        }
+  const IntTab& elem_som = domaine.les_elems(); // connectivity tables are fetched once here and passed to remplir_sommets as plain arrays
+  const IntTab& face_som = domaine_VEF.face_sommets();
+  const IntTab& elem_faces = domaine_VEF.elem_faces();
+  const ArrOfInt& renum_som_perio = domaine.get_renum_som_perio();
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+  ArrOfInt cl(nb_faces_tot); // one boundary-condition code per face index in [0, nb_faces_tot)
+  build_cl(cl, les_cl); // fills cl from the boundary conditions (build_cl is defined outside this hunk)
+  int sommets[5] = { -1,-1,-1,-1,-1}; // fixed-size scratch replacing ArrOfInt(dimension+2); assumes dimension <= 3 - TODO confirm
+  int face_opp1[5]= { -1,-1,-1,-1,-1};
+  int face_opp2[5]= { -1,-1,-1,-1,-1};
+  ToDo_Kokkos("assemblerP1Pa: parallelise this loop over faces");
+  Stencil stencil; // replaces the former per-row voisins/coeffs lists
+  stencil.resize(nb_faces_tot * 50, 2); // capacity guess of 50 index pairs per face; NOTE(review): the product is evaluated in int - confirm nb_faces_tot*50 stays below INT_MAX
+  int nnz = 0; // number of stencil rows written so far; presumably advanced by the contribuer_* helpers that receive it with the stencil
+  for (int face = 0; face < nb_faces_tot; face++) // one pass over every face replaces the former boundary loop followed by the internal-face loop
+    {
+      if (cl(face) == DIRICHLET || cl(face) == OTHER) continue; // no contribution is made for these two face kinds
+      int elem1 = face_voisins(face, 0);
+      int elem2 = face_voisins(face, 1);
+      int face_associee = cl(face) < 0 ? -1 : cl(face); // a non-negative code is forwarded as the associated face index (periodic pairing, presumably); negative codes give -1
+      remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+      if (cl(face) == NEUMANN)
+        contribuer_matrice_NeumannP1Pa(domaine_VEF, elem1, sommets, stencil, nnz);
+      else if (cl(face) == SYMMETRY)
+        contribuer_matrice_SymetrieP1Pa(domaine_VEF, elem1, sommets, stencil, nnz);
+      else // every remaining code takes the generic two-element contribution
+        contribuer_matriceP1Pa(domaine_VEF, elem1, elem2, sommets, stencil, nnz);
     }
-
+  stencil.resize(nnz, 2); // shrink from the capacity guess to the rows counted in nnz
   matrice.typer("Matrice_Bloc");
   Matrice_Bloc& matrice_bloc=ref_cast(Matrice_Bloc, matrice.valeur());
-  matrice_bloc.remplir(voisins, coeffs, domaine.nb_som(), domaine.nb_som_tot(), domaine.nb_aretes(), domaine.nb_aretes_tot());
+  matrice_bloc.remplir(stencil, domaine.nb_som(), domaine.nb_som_tot(), domaine.nb_aretes(), domaine.nb_aretes_tot()); // the 2-column stencil is the only pattern input now; the former coeffs argument is gone
   Cerr << "Assemblage P1Pa OK" << finl;
 }
 
@@ -2355,58 +1812,38 @@ void updateP1Pa(const Domaine_dis_base& z,
                 Matrice& matrice,
                 const DoubleTab& inverse_quantitee_entrelacee, const ArrOfDouble& coef_som)
 {
-  int dimension=Objet_U::dimension;
   const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z);
-  //const Domaine& domaine=domaine_VEF.domaine();
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
   const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
-  int elem1, elem2, face, ok;
-  //int nb_som = domaine.nb_som_tot();
-  //IntLists voisins(nb_som);
-  //DoubleLists coeffs(nb_som);
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
+  const IntTab& elem_som = domaine_VEF.domaine().les_elems(); // connectivity tables are fetched once here and passed to remplir_sommets as plain arrays
+  const IntTab& face_som = domaine_VEF.face_sommets();
+  const IntTab& elem_faces = domaine_VEF.elem_faces();
+  const ArrOfInt& renum_som_perio = domaine_VEF.domaine().get_renum_som_perio();
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+  ArrOfInt cl(nb_faces_tot); // one boundary-condition code per face index in [0, nb_faces_tot)
+  build_cl(cl, les_cl); // fills cl from the boundary conditions (build_cl is defined outside this hunk)
+  int sommets[5] = { -1,-1,-1,-1,-1}; // fixed-size scratch replacing ArrOfInt(dimension+2); assumes dimension <= 3 - TODO confirm
+  int face_opp1[5]= { -1,-1,-1,-1,-1};
+  int face_opp2[5]= { -1,-1,-1,-1,-1};
   Matrice_Bloc& A=ref_cast(Matrice_Bloc, matrice.valeur());
   Matrice_Morse& ARR=ref_cast(Matrice_Morse, A.get_bloc(0,0).valeur());
   Matrice_Morse& ARV=ref_cast(Matrice_Morse, A.get_bloc(0,1).valeur());
   Matrice_Morse& AVR=ref_cast(Matrice_Morse, A.get_bloc(1,0).valeur());
   Matrice_Morse& AVV=ref_cast(Matrice_Morse, A.get_bloc(1,1).valeur());
-  // Faces de bord :
-  for (auto &itr : les_cl)
-    {
-      const Cond_lim& la_cl = itr;
-      const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
-        {
-          ok = okface(ind_face, face, la_cl);
-          if (ok == -1)
-            break;
-          elem1 = face_voisins(face, 0);
-          elem2 = face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          if (ok == 3)
-            update_matrice_NeumannP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV);
-          else if (ok == 4)
-            update_matrice_SymetrieP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV);
-          else
-            update_matriceP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV);
-        }
-    }
-  face_associee=-1;
-  for (face = nint; face < nb_faces; face++)
-    {
-      elem1 = face_voisins(face, 0);
-      elem2 = face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          update_matriceP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV);
-        }
+  for (int face = 0; face < nb_faces_tot; face++) // one pass over every face replaces the former boundary loop followed by the internal-face loop
+    {
+      if (cl(face) == DIRICHLET || cl(face) == OTHER) continue; // no update is made for these two face kinds
+      int elem1 = face_voisins(face, 0);
+      int elem2 = face_voisins(face, 1);
+      int face_associee = cl(face) < 0 ? -1 : cl(face); // a non-negative code is forwarded as the associated face index (periodic pairing, presumably); negative codes give -1
+      remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+      if (cl(face) == NEUMANN)
+        update_matrice_NeumannP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV);
+      else if (cl(face) == SYMMETRY)
+        update_matrice_SymetrieP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV);
+      else // every remaining code takes the generic two-element update
+        update_matriceP1Pa(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV);
     }
 
   Cerr << "Update P1Pa OK" << finl;
@@ -2422,122 +1859,129 @@ void assemblerP0P1(const Domaine_dis_base& z,
   const Domaine& domaine=domaine_VEF.domaine();
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
-  int nb_elem = domaine.nb_elem_tot();
-  int elem1, elem2, face, ok;
-  IntLists voisins(nb_elem);
-  DoubleLists coeffs(nb_elem);
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
-  // Faces de bord :
-  for(int i=0; i<les_cl.size(); i++)
-    {
-      const Cond_lim& la_cl = les_cl[i];
-      const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for(int ind_face=0; ind_face<nb_faces_bord_tot; ind_face++)
-        {
-          ok=okface(ind_face, face, la_cl);
-          if (ok==-1) break;
-          elem1=face_voisins(face, 0);
-          elem2=face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          sort(sommets, face_opp1, face_opp2);
-          if(ok==3)
-            {
-              contribuer_matrice_NeumannP0P1(elem1, sommets, voisins, coeffs);
-            }
-          else if(ok==4)
-            {
-              ;// RIEN
-            }
-          else
-            contribuer_matriceP0P1(elem1, elem2, sommets, voisins, coeffs);
-        }
-    }
-  face_associee=-1;
-  for(face=nint; face<nb_faces; face++)
-    {
-      elem1=face_voisins(face, 0);
-      elem2=face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          sort(sommets, face_opp1, face_opp2);
-          contribuer_matriceP0P1(elem1, elem2, sommets, voisins, coeffs);
-        }
-    }
-
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+
+  ArrOfInt tab_cl(nb_faces_tot); // one boundary-condition code per face index in [0, nb_faces_tot)
+  build_cl(tab_cl, les_cl); // fills tab_cl from the boundary conditions (build_cl is defined outside this hunk)
+  const int max_nnz_per_face = 2 * (dimension + 2); // at most dimension+2 vertices per face, each paired with up to two elements
+  Stencil tab_stencil; // (row, column) index pairs; replaces the former per-row voisins/coeffs lists
+  using nnz_t = decltype(tab_stencil.dimension(0)); // index type of the stencil, declared before the resize so the capacity can be computed in it
+  tab_stencil.resize(static_cast<nnz_t>(nb_faces_tot) * max_nnz_per_face, 2); // widen before multiplying: an int*int product could overflow before reaching the stencil
+  TRUSTArray<nnz_t, int> tab_nnz(1); // single shared counter of stencil rows reserved so far
+  tab_nnz(0) = 0;
+  auto nnz = tab_nnz.view_rw();
+  CIntArrView cl = tab_cl.view_ro();
+  CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro();
+  CIntTabView elem_som = domaine_VEF.domaine().les_elems().view_ro();
+  CIntTabView face_som = domaine_VEF.face_sommets().view_ro();
+  CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro();
+  CIntArrView renum_som_perio = domaine_VEF.domaine().get_renum_som_perio().view_ro();
+  auto stencil = tab_stencil.view_wo();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), nb_faces_tot, KOKKOS_LAMBDA(const int face)
+  {
+    if (cl(face)==NEUMANN // Neumann
+        || cl(face)==SYMMETRY // Symmetry - NOTE(review): the removed serial code added nothing for symmetry faces (ok==4); confirm these pattern entries are intended
+        || cl(face)>0 // Periodic
+        || cl(face)==INTERNAL) // Internal face
+      {
+        int sommets[5] = { -1,-1,-1,-1,-1}; // per-face scratch, local to each kernel iteration
+        int face_opp1[5]= { -1,-1,-1,-1,-1};
+        int face_opp2[5]= { -1,-1,-1,-1,-1};
+        int elem1=face_voisins(face, 0);
+        int elem2=face_voisins(face, 1);
+        int face_associee = cl(face) < 0 ? -1 : cl(face); // Periodic: a non-negative code is forwarded as the associated face index
+        remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+        sort(sommets, face_opp1, face_opp2);
+        int size = dimension + 2;
+        for (int i = 0; i < size; i++)
+          {
+            int si = sommets[i];
+            if (si < 0) break; // stop at the first negative vertex slot
+            nnz_t slot = Kokkos::atomic_fetch_add(&nnz(0), 1); // reserve a unique row; the resulting row order may differ between runs
+            stencil(slot, 0) = elem1;
+            stencil(slot, 1) = si;
+            if (elem2 != -1) // a second neighbouring element exists: add its pair too
+              {
+                slot = Kokkos::atomic_fetch_add(&nnz(0), 1);
+                stencil(slot, 0) = elem2;
+                stencil(slot, 1) = si;
+              }
+          }
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
+  tab_stencil.resize(tab_nnz(0), 2); // shrink from the capacity bound to the rows actually reserved
   matrice.typer("Matrice_Bloc");
   Matrice_Bloc& matrice_bloc=ref_cast(Matrice_Bloc, matrice.valeur());
-  matrice_bloc.remplir(voisins, coeffs, domaine.nb_elem(), domaine.nb_elem_tot(), domaine.nb_som(), domaine.nb_som_tot());
+  matrice_bloc.remplir(tab_stencil, domaine.nb_elem(), domaine.nb_elem_tot(), domaine.nb_som(), domaine.nb_som_tot());
   Cerr << "Assemblage POP1 OK" << finl;
 }
 
 void updateP0P1(const Domaine_dis_base& z,
                 const Domaine_Cl_dis_base& zcl,
                 Matrice& matrice,
-                const DoubleTab& inverse_quantitee_entrelacee, const ArrOfDouble& coef_som)
+                const DoubleTab& tab_inverse_quantitee_entrelacee, const ArrOfDouble& tab_coef_som) // tab_ prefix frees the short names for the views created below
 {
   int dimension=Objet_U::dimension;
   const Domaine_VEF& domaine_VEF=ref_cast(Domaine_VEF, z);
   const Domaine_Cl_VEF& domaine_Cl_VEF=ref_cast(Domaine_Cl_VEF, zcl);
   const Conds_lim& les_cl = domaine_Cl_VEF.les_conditions_limites();
-  const IntTab& face_voisins = domaine_VEF.face_voisins();
-  int nint = domaine_VEF.premiere_face_int();
-  int nb_faces = domaine_VEF.nb_faces_tot();
-  int elem1, elem2, face, ok;
-  ArrOfInt sommets(dimension+2);
-  ArrOfInt face_opp1(dimension+2);
-  ArrOfInt face_opp2(dimension+2);
+  int nb_faces_tot = domaine_VEF.nb_faces_tot();
+  ArrOfInt tab_cl(nb_faces_tot); // one boundary-condition code per face index in [0, nb_faces_tot)
+  build_cl(tab_cl, les_cl); // fills tab_cl from the boundary conditions (build_cl is defined outside this hunk)
+  int nb_elem=domaine_VEF.nb_elem(); // row threshold handed to range() in the kernel
+  int nb_som=domaine_VEF.nb_som(); // column threshold handed to range() in the kernel
   Matrice_Bloc& A=ref_cast(Matrice_Bloc, matrice.valeur());
-  Matrice_Morse& ARR=ref_cast(Matrice_Morse, A.get_bloc(0,0).valeur());
-  Matrice_Morse& ARV=ref_cast(Matrice_Morse, A.get_bloc(0,1).valeur());
-  Matrice_Morse& AVR=ref_cast(Matrice_Morse, A.get_bloc(1,0).valeur());
-  Matrice_Morse& AVV=ref_cast(Matrice_Morse, A.get_bloc(1,1).valeur());
-  // Faces de bord :
-  for (auto &itr : les_cl)
-    {
-      const Cond_lim& la_cl = itr;
-      const Front_VF& le_bord = ref_cast(Front_VF, la_cl->frontiere_dis());
-      int nb_faces_bord_tot = le_bord.nb_faces_tot();
-      for (int ind_face = 0; ind_face < nb_faces_bord_tot; ind_face++)
-        {
-          ok = okface(ind_face, face, la_cl);
-          if (ok == -1)
-            break;
-          elem1 = face_voisins(face, 0);
-          elem2 = face_voisins(face, 1);
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets, face_opp1, face_opp2);
-          sort(sommets, face_opp1, face_opp2);
-          if (ok == 3)
-            update_matrice_NeumannP0P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, sommets, face_opp1, coef_som, ARR, ARV, AVR, AVV);
-          else if (ok == 4) { /* Do nothing */ }
-          else
-            update_matriceP0P1(domaine_VEF, inverse_quantitee_entrelacee, face, elem1, elem2, sommets, face_opp1, face_opp2, coef_som, ARR, ARV, AVR, AVV);
-        }
-    }
-  face_associee=-1;
-  for(face=nint; face<nb_faces; face++)
-    {
-      elem1=face_voisins(face, 0);
-      elem2=face_voisins(face, 1);
-      if (!domaine_VEF.est_une_face_virt_bord(face)) // On ne traite que les faces internes
-        {
-          remplir_sommets(domaine_VEF, face, elem1, elem2, sommets,
-                          face_opp1, face_opp2);
-          sort(sommets, face_opp1, face_opp2);
-          update_matriceP0P1(domaine_VEF,
-                             inverse_quantitee_entrelacee,
-                             face, elem1, elem2, sommets,
-                             face_opp1, face_opp2, coef_som,
-                             ARR,ARV,AVR,AVV);
-        }
-    }
+  Matrice_Morse_View ARR, ARV, AVR, AVV; // view wrappers over the four blocks, used inside the kernel in place of the Matrice_Morse references
+  ARR.set(ref_cast(Matrice_Morse, A.get_bloc(0,0).valeur()));
+  ARV.set(ref_cast(Matrice_Morse, A.get_bloc(0,1).valeur()));
+  AVR.set(ref_cast(Matrice_Morse, A.get_bloc(1,0).valeur()));
+  AVV.set(ref_cast(Matrice_Morse, A.get_bloc(1,1).valeur()));
+  CIntArrView cl = tab_cl.view_ro();
+  CIntTabView face_voisins = domaine_VEF.face_voisins().view_ro();
+  CIntTabView elem_som = domaine_VEF.domaine().les_elems().view_ro();
+  CIntTabView face_som = domaine_VEF.face_sommets().view_ro();
+  CIntTabView elem_faces = domaine_VEF.elem_faces().view_ro();
+  CIntArrView renum_som_perio = domaine_VEF.domaine().get_renum_som_perio().view_ro();
+  CDoubleTabView normales = domaine_VEF.face_normales().view_ro();
+  CDoubleArrView coef_som = tab_coef_som.view_ro();
+  CDoubleTabView inverse_quantitee_entrelacee = tab_inverse_quantitee_entrelacee.view_ro();
+  Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_faces_tot), KOKKOS_LAMBDA(const int face)
+  {
+    int cl_face = cl(face);
+    if (cl_face != NEUMANN && cl_face <= 0 && cl_face != INTERNAL) return; // skip Dirichlet, symmetry, unknown BC; NOTE(review): a code of 0 is skipped too although the ternary below would accept it as a face index - confirm build_cl never yields 0
+
+    int sommets[5] = { -1,-1,-1,-1,-1}; // per-face scratch, local to each kernel iteration
+    int face_opp1[5]= { -1,-1,-1,-1,-1};
+    int face_opp2[5]= { -1,-1,-1,-1,-1};
+    int elem1 = face_voisins(face, 0);
+    int elem2 = face_voisins(face, 1);
+    int face_associee = cl_face < 0 ? -1 : cl_face; // Periodic
+    remplir_sommets(elem_som, face_som, elem_faces, renum_som_perio, face, face_associee, elem1, elem2, sommets, face_opp1, face_opp2);
+    sort(sommets, face_opp1, face_opp2);
+
+    int size = dimension + 2;
+    double unsurdim = 1./dimension;
+    for (int i = 0; i < size; i++)
+      {
+        int si = sommets[i];
+        if (si < 0) break; // stop at the first negative vertex slot
+        double gradi_[3];
+        calculer_grad(face_voisins, elem1, elem2, coef_som, si, face_opp1[i], elem2==-1 ? -1 : face_opp2[i], normales, gradi_); // no second opposite face is passed when the face has a single element
+        if (elem2==-1 && face_opp1[i]!=face) // single-element face and the opposite face is not this face: add normal/dimension
+          for (int k=0; k<dimension; k++)
+            gradi_[k] += normales(face, k)*unsurdim;
+        double gradj_[3];
+        for (int k = 0; k < dimension; k++)
+          gradj_[k] = normales(face, k); // second factor of the product is the face normal itself
+
+        double psc = dotproduct_array_fois_inverse_quantitee_entrelacee(gradi_, gradj_, inverse_quantitee_entrelacee, face);
+        range(elem1, nb_elem, si, nb_som, ARR, ARV, AVR, AVV, psc);
+        if (elem2 != -1)
+          range(elem2, nb_elem, si, nb_som, ARR, ARV, AVR, AVV, -psc); // the second element receives the opposite sign
+      }
+  });
+  end_gpu_timer(__KERNEL_NAME__);
   Cerr << "Update POP1 OK" << finl;
 }
 
-
diff --git a/src/VEF/Solveurs/Assembleur_P_VEFPreP1B.cpp b/src/VEF/Solveurs/Assembleur_P_VEFPreP1B.cpp
index ee1c0c80d7..c05baf6e78 100644
--- a/src/VEF/Solveurs/Assembleur_P_VEFPreP1B.cpp
+++ b/src/VEF/Solveurs/Assembleur_P_VEFPreP1B.cpp
@@ -213,6 +213,7 @@ void zero(Matrice_Bloc_Sym& matrice)
 
 int Assembleur_P_VEFPreP1B::assembler_mat(Matrice& la_matrice,const DoubleVect& quantitee_entrelacee, int incr_pression, int resoudre_en_u)
 {
+  statistics().begin_count(STD_COUNTERS::matrix_assembly,statistics().get_last_opened_counter_level()+1);
   // On fixe les drapeaux de Assembleur_base
   set_resoudre_increment_pression(incr_pression);
   set_resoudre_en_u(resoudre_en_u);
@@ -393,7 +394,7 @@ int Assembleur_P_VEFPreP1B::assembler_mat(Matrice& la_matrice,const DoubleVect&
   char* theValue2 = getenv("TRUST_CONDITIONNEMENT_MATRICE");
   if(theValue2 != nullptr)
     Cout << "Estimation du conditionnement de la matrice: " << estim_cond(la_matrice)<<finl;
-
+  statistics().end_count(STD_COUNTERS::matrix_assembly);
   return 1;
 }
 
@@ -966,32 +967,31 @@ int Assembleur_P_VEFPreP1B::modifier_matrice(Matrice& la_matrice)
   return matrice_modifiee;
 }
 
-inline void range(double& prod, int& i, int& n, int& j, int& m, Matrice_Morse& ARR, Matrice_Morse& ARV, Matrice_Morse& AVR, Matrice_Morse& AVV)
+KOKKOS_INLINE_FUNCTION static void range(double prod, int i, int n, int j, int m, // Adds prod at (row i, column j) of a 2x2 block layout split at row n / column m
+                                         Matrice_Morse_View ARR, Matrice_Morse_View ARV, // blocks selected when i<n (ARR for j<m, ARV for j>=m)
+                                         Matrice_Morse_View AVR, Matrice_Morse_View AVV) // blocks selected when i>=n (AVR for j<m, AVV for j>=m)
 {
   if (i<n)
-    if (j<m)
-      ARR(i,j)+=prod;
-    else
-      ARV(i,j-m)+=prod;
-  else if (j<m)
-    AVR(i-n,j)+=prod;
-  else
-    AVV(i-n,j-m)+=prod;
+    {
+      if (j<m) ARR.atomic_add(i, j, prod); // i<n, j<m: both indices used as-is
+      else ARV.atomic_add(i, j-m, prod); // i<n, j>=m: column index shifted by m
+    }
+  else if (j<m) AVR.atomic_add(i-n, j, prod); // i>=n, j<m: row index shifted by n
+  else AVV.atomic_add(i-n, j-m, prod); // i>=n, j>=m: row shifted by n, column shifted by m
 }
 
 void operation11(Matrice_Bloc& A00, Matrice_Bloc& A01, Matrice_Bloc& A11, double beta, const Domaine& domaine)
 {
-  //Cerr << "Operation11" << finl;
-  Matrice_Morse_Sym& A11RR=ref_cast(Matrice_Morse_Sym, A11.get_bloc(0,0).valeur());
-  Matrice_Morse& A11RV=ref_cast(Matrice_Morse, A11.get_bloc(0,1).valeur());
-  Matrice_Morse& A11VR=ref_cast(Matrice_Morse, A11.get_bloc(1,0).valeur());
-  Matrice_Morse& A11VV=ref_cast(Matrice_Morse, A11.get_bloc(1,1).valeur());
-  const IntTab& les_elems=domaine.les_elems();
-  const Domaine& dom=domaine;
-  int nb_som=A11RR.nb_lignes();
-  int nb_som_elem=les_elems.dimension(1);
+  int nb_som = ref_cast(Matrice_Morse_Sym, A11.get_bloc(0,0).valeur()).nb_lignes();
+  Matrice_Morse_View A11RR, A11RV, A11VR, A11VV;
+  A11RR.set(ref_cast(Matrice_Morse_Sym, A11.get_bloc(0,0).valeur()));
+  A11RV.set(ref_cast(Matrice_Morse, A11.get_bloc(0,1).valeur()));
+  A11VR.set(ref_cast(Matrice_Morse, A11.get_bloc(1,0).valeur()));
+  A11VV.set(ref_cast(Matrice_Morse, A11.get_bloc(1,1).valeur()));
+  CIntTabView les_elems = domaine.les_elems().view_ro();
+  CIntArrView renum_som_perio = domaine.get_renum_som_perio().view_ro();
+  int nb_som_elem = domaine.les_elems().dimension(1);
   // On parcours les elements de la matrice A00
-  //Cerr << "[" << Process::me() << "] Contribution de A00 dans A11~" << finl;
   int ligne=0;
   for (int i_bloc=0; i_bloc<A00.nb_bloc_lignes(); i_bloc++)
     {
@@ -1000,50 +1000,50 @@ void operation11(Matrice_Bloc& A00, Matrice_Bloc& A01, Matrice_Bloc& A11, double
       for (int j_bloc=0; j_bloc<A00.nb_bloc_colonnes(); j_bloc++)
         {
           Matrice_Morse& A00ij=ref_cast(Matrice_Morse, A00.get_bloc(i_bloc,j_bloc).valeur());
-          const auto* tab1=A00ij.get_tab1().addr();
-          const int* tab2=A00ij.get_tab2().addr();
-          const double* coeff=A00ij.get_coeff().addr();
+          auto tab1 = A00ij.get_tab1().view_ro(); // type differs between GPU (CTIDArrView) and CPU (CIntArrView)
+          CIntArrView tab2 = A00ij.get_tab2().view_ro();
+          CDoubleArrView coeff = A00ij.get_coeff().view_ro();
           nb_lignes=A00ij.nb_lignes();
           int nb_colonnes=A00ij.nb_colonnes();
-          for (int i=0; i<nb_lignes; i++)
-            {
-              int k1=ligne+i; // Element k1
-              for (auto n=tab1[i]-1; n<tab1[i+1]-1; n++)
-                {
-                  int k2=colonne+tab2[n]-1; // Element k2
-                  if (k2>=k1)
-                    {
-                      double prod = beta * beta * coeff[n];        // Calcul de beta*beta*Ak1k2
-                      for (int som1=0; som1<nb_som_elem; som1++)
-                        {
-                          int s1 = dom.get_renum_som_perio(les_elems(k1,som1));
-                          for (int som2=0; som2<nb_som_elem; som2++)
-                            {
-                              int s2 = dom.get_renum_som_perio(les_elems(k2,som2));
-                              if (s2>=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV);
-                            }
-                        }
-                      if (k1!=k2)
-                        {
-                          for (int som1=0; som1<nb_som_elem; som1++)
-                            {
-                              int s1 = dom.get_renum_som_perio(les_elems(k2,som1));
-                              for (int som2=0; som2<nb_som_elem; som2++)
-                                {
-                                  int s2 = dom.get_renum_som_perio(les_elems(k1,som2));
-                                  if (s2>=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_lignes), KOKKOS_LAMBDA(const int i) // one iteration per row i of the current A00 block
+          {
+            int k1=ligne+i; // Element k1: local row i plus the running row offset of this block
+            for (auto n=tab1(i)-1; n<tab1(i+1)-1; n++) // walk the stored coefficients of row i (tab1 values are shifted by -1 before use)
+              {
+                int k2=colonne+tab2(n)-1; // Element k2: stored column (shifted by -1) plus the running column offset
+                if (k2>=k1) // only coefficients with k2>=k1 are processed
+                  {
+                    double prod = beta * beta * coeff(n); // beta*beta*A(k1,k2)
+                    for (int som1=0; som1<nb_som_elem; som1++)
+                      {
+                        int s1 = renum_som_perio(les_elems(k1,som1)); // renumbered index of entry som1 of element k1
+                        for (int som2=0; som2<nb_som_elem; som2++)
+                          {
+                            int s2 = renum_som_perio(les_elems(k2,som2)); // renumbered index of entry som2 of element k2
+                            if (s2>=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV); // only s2>=s1 is added; range() accumulates with atomic_add
+                          }
+                      }
+                    if (k1!=k2) // off-diagonal coefficient: repeat with the roles of k1 and k2 swapped
+                      {
+                        for (int som1=0; som1<nb_som_elem; som1++)
+                          {
+                            int s1 = renum_som_perio(les_elems(k2,som1)); // row index now taken from element k2
+                            for (int som2=0; som2<nb_som_elem; som2++)
+                              {
+                                int s2 = renum_som_perio(les_elems(k1,som2)); // column index now taken from element k1
+                                if (s2>=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV); // same s2>=s1 filter as above
+                              }
+                          }
+                      }
+                  }
+              }
+          });
+          end_gpu_timer(__KERNEL_NAME__); // paired with start_gpu_timer(__KERNEL_NAME__) passed to parallel_for above
           colonne+=nb_colonnes;
         }
       ligne+=nb_lignes;
     }
   // On parcours les elements de la matrice A01
-  //Cerr << "[" << Process::me() << "] Contribution de A01 dans A11~" << finl;
   ligne=0;
   for (int i_bloc=0; i_bloc<A01.nb_bloc_lignes(); i_bloc++)
     {
@@ -1052,26 +1052,27 @@ void operation11(Matrice_Bloc& A00, Matrice_Bloc& A01, Matrice_Bloc& A11, double
       for (int j_bloc=0; j_bloc<A01.nb_bloc_colonnes(); j_bloc++)
         {
           Matrice_Morse& A01ij=ref_cast(Matrice_Morse, A01.get_bloc(i_bloc,j_bloc).valeur());
-          const auto* tab1=A01ij.get_tab1().addr();
-          const int* tab2=A01ij.get_tab2().addr();
-          const double* coeff=A01ij.get_coeff().addr();
+          auto tab1 = A01ij.get_tab1().view_ro(); // type differs between GPU (CTIDArrView) and CPU (CIntArrView)
+          CIntArrView tab2 = A01ij.get_tab2().view_ro();
+          CDoubleArrView coeff = A01ij.get_coeff().view_ro();
           nb_lignes=A01ij.nb_lignes();
           int nb_colonnes=A01ij.nb_colonnes();
-          for (int i=0; i<nb_lignes; i++)
-            {
-              int k=ligne+i; // Element k
-              for (auto n=tab1[i]-1; n<tab1[i+1]-1; n++)
-                {
-                  int s1 = dom.get_renum_som_perio(colonne+tab2[n]-1); // Sommet s1
-                  double prod = -beta * coeff[n];        // Calcul de -beta*Aks
-                  for (int som=0; som<nb_som_elem; som++)
-                    {
-                      int s2 = dom.get_renum_som_perio(les_elems(k,som)); // Sommet s2
-                      if (s2>=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV);
-                      if (s1>=s2) range(prod,s2,nb_som,s1,nb_som,A11RR,A11RV,A11VR,A11VV);
-                    }
-                }
-            }
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_lignes), KOKKOS_LAMBDA(const int i) // one iteration per row i of the current A01 block
+          {
+            int k=ligne+i; // Element k: local row i plus the running row offset of this block
+            for (auto n=tab1(i)-1; n<tab1(i+1)-1; n++) // walk the stored coefficients of row i (tab1 values are shifted by -1 before use)
+              {
+                int s1 = renum_som_perio(colonne+tab2(n)-1); // Vertex s1: global column of the coefficient, then renumbered
+                double prod = -beta * coeff(n); // -beta*A(k,s)
+                for (int som=0; som<nb_som_elem; som++)
+                  {
+                    int s2 = renum_som_perio(les_elems(k,som)); // Vertex s2: renumbered index of entry som of element k
+                    if (s2>=s1) range(prod,s1,nb_som,s2,nb_som,A11RR,A11RV,A11VR,A11VV); // add at (s1,s2) when s2>=s1
+                    if (s1>=s2) range(prod,s2,nb_som,s1,nb_som,A11RR,A11RV,A11VR,A11VV); // add at (s2,s1) when s1>=s2; for s1==s2 both calls run, so prod is added twice
+                  }
+              }
+          });
+          end_gpu_timer(__KERNEL_NAME__); // paired with start_gpu_timer(__KERNEL_NAME__) passed to parallel_for above
           colonne+=nb_colonnes;
         }
       ligne+=nb_lignes;
@@ -1081,15 +1082,16 @@ void operation11(Matrice_Bloc& A00, Matrice_Bloc& A01, Matrice_Bloc& A11, double
 void operation01(Matrice_Bloc& A00, Matrice_Bloc& A01, double alpha, double beta, const Domaine& domaine)
 {
   //Cerr << "[" << Process::me() << "] Operation01" << finl;
-  Matrice_Morse& A01RR=ref_cast(Matrice_Morse, A01.get_bloc(0,0).valeur());
-  Matrice_Morse& A01RV=ref_cast(Matrice_Morse, A01.get_bloc(0,1).valeur());
-  Matrice_Morse& A01VR=ref_cast(Matrice_Morse, A01.get_bloc(1,0).valeur());
-  Matrice_Morse& A01VV=ref_cast(Matrice_Morse, A01.get_bloc(1,1).valeur());
-  const IntTab& les_elems=domaine.les_elems();
-  const Domaine& dom=domaine;
-  int nb_elem=A01RR.nb_lignes();
-  int nb_som=A01RR.nb_colonnes();
-  int nb_som_elem=les_elems.dimension(1);
+  int nb_elem = ref_cast(Matrice_Morse, A01.get_bloc(0,0).valeur()).nb_lignes();
+  int nb_som  = ref_cast(Matrice_Morse, A01.get_bloc(0,0).valeur()).nb_colonnes();
+  Matrice_Morse_View A01RR, A01RV, A01VR, A01VV;
+  A01RR.set(ref_cast(Matrice_Morse, A01.get_bloc(0,0).valeur()));
+  A01RV.set(ref_cast(Matrice_Morse, A01.get_bloc(0,1).valeur()));
+  A01VR.set(ref_cast(Matrice_Morse, A01.get_bloc(1,0).valeur()));
+  A01VV.set(ref_cast(Matrice_Morse, A01.get_bloc(1,1).valeur()));
+  CIntTabView les_elems = domaine.les_elems().view_ro();
+  CIntArrView renum_som_perio = domaine.get_renum_som_perio().view_ro();
+  int nb_som_elem = domaine.les_elems().dimension(1);
   // On parcours les coefficients de A00
   int ligne=0;
   for (int i_bloc=0; i_bloc<A00.nb_bloc_lignes(); i_bloc++)
@@ -1099,34 +1101,34 @@ void operation01(Matrice_Bloc& A00, Matrice_Bloc& A01, double alpha, double beta
       for (int j_bloc=0; j_bloc<A00.nb_bloc_colonnes(); j_bloc++)
         {
           Matrice_Morse& A00ij=ref_cast(Matrice_Morse, A00.get_bloc(i_bloc,j_bloc).valeur());
-          const auto* tab1=A00ij.get_tab1().addr();
-          const int* tab2=A00ij.get_tab2().addr();
-          const double* coeff=A00ij.get_coeff().addr();
+          auto tab1 = A00ij.get_tab1().view_ro(); // type differs between GPU (CTIDArrView) and CPU (CIntArrView)
+          CIntArrView tab2 = A00ij.get_tab2().view_ro();
+          CDoubleArrView coeff = A00ij.get_coeff().view_ro();
           nb_lignes=A00ij.nb_lignes();
           int nb_colonnes=A00ij.nb_colonnes();
-          int s;
-          for (int i=0; i<nb_lignes; i++)
-            {
-              int k1=ligne+i; // Element k1
-              for (auto n=tab1[i]-1; n<tab1[i+1]-1; n++)
-                {
-                  int k2=colonne+tab2[n]-1; // Element k2
-                  if (k2>=k1)
-                    {
-                      double prod = -alpha * beta * coeff[n];        // Calcul de -alpha*beta*Ak1k2
-                      for (int som=0; som<nb_som_elem; som++)
-                        {
-                          s = dom.get_renum_som_perio(les_elems(k2,som));
-                          range(prod,k1,nb_elem,s,nb_som,A01RR,A01RV,A01VR,A01VV);
-                          if (k1!=k2)
-                            {
-                              s = dom.get_renum_som_perio(les_elems(k1,som));
-                              range(prod,k2,nb_elem,s,nb_som,A01RR,A01RV,A01VR,A01VV);
-                            }
-                        }
-                    }
-                }
-            }
+          Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nb_lignes), KOKKOS_LAMBDA(const int i) // one iteration per row i of the current A00 block
+          {
+            int k1=ligne+i; // Element k1: local row i plus the running row offset of this block
+            for (auto n=tab1(i)-1; n<tab1(i+1)-1; n++) // walk the stored coefficients of row i (tab1 values are shifted by -1 before use)
+              {
+                int k2=colonne+tab2(n)-1; // Element k2: stored column (shifted by -1) plus the running column offset
+                if (k2>=k1) // only coefficients with k2>=k1 are processed
+                  {
+                    double prod = -alpha * beta * coeff(n); // -alpha*beta*A(k1,k2)
+                    for (int som=0; som<nb_som_elem; som++)
+                      {
+                        int s = renum_som_perio(les_elems(k2,som)); // renumbered index of entry som of element k2
+                        range(prod,k1,nb_elem,s,nb_som,A01RR,A01RV,A01VR,A01VV); // add at (row k1, column s); range() accumulates with atomic_add
+                        if (k1!=k2) // off-diagonal coefficient: also add the swapped contribution
+                          {
+                            s = renum_som_perio(les_elems(k1,som)); // renumbered index of entry som of element k1
+                            range(prod,k2,nb_elem,s,nb_som,A01RR,A01RV,A01VR,A01VV); // add at (row k2, column s)
+                          }
+                      }
+                  }
+              }
+          });
+          end_gpu_timer(__KERNEL_NAME__); // paired with start_gpu_timer(__KERNEL_NAME__) passed to parallel_for above
           colonne+=nb_colonnes;
         }
       ligne+=nb_lignes;
diff --git a/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.cpp b/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.cpp
index e73111e5b9..7fc7e0f819 100644
--- a/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.cpp
+++ b/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (c) 2025, CEA
+* Copyright (c) 2026, CEA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -153,7 +153,7 @@ void Terme_Source_Canal_perio_VEF_P1NC::calculer_debit(double& debit_e) const
                 double debit_face = 0;
                 if (axe >= 0)
                   debit_face += porosite_face(num_face) * vitesse(num_face, axe) *
-                                std::fabs(face_normales(num_face, axe));
+                                Kokkos::fabs(face_normales(num_face, axe));
                 else
                   {
                     for (int i = 0; i < dim; i++)
@@ -170,4 +170,3 @@ void Terme_Source_Canal_perio_VEF_P1NC::calculer_debit(double& debit_e) const
 }
 
 
-
diff --git a/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.h b/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.h
index c3c51e079e..6e69a2ba02 100644
--- a/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.h
+++ b/src/VEF/Sources/Canal/Terme_Source_Canal_perio_VEF_P1NC.h
@@ -47,7 +47,7 @@ protected :
   OBS_PTR(Domaine_Cl_VEF) le_dom_Cl_VEF;
   void associer_domaines(const Domaine_dis_base& ,const Domaine_Cl_dis_base& ) override;
 
-  public_for_cuda
+  protected_but_public_for_cuda
   void calculer_debit(double&) const override;
   // les attributs ont ete mis dans la classe mere
 
diff --git a/src/VEF/Sources/Dilatable/Source_Fluide_Dilatable_VEF_Proto.h b/src/VEF/Sources/Dilatable/Source_Fluide_Dilatable_VEF_Proto.h
index dd181ea78c..46ad13345e 100644
--- a/src/VEF/Sources/Dilatable/Source_Fluide_Dilatable_VEF_Proto.h
+++ b/src/VEF/Sources/Dilatable/Source_Fluide_Dilatable_VEF_Proto.h
@@ -32,7 +32,7 @@ class Source_Fluide_Dilatable_VEF_Proto
   void associer_domaines_impl(const Domaine_dis_base& domaine,const Domaine_Cl_dis_base& domaine_cl);
   void associer_volume_porosite_impl(const Domaine_dis_base& domaine, DoubleVect& volumes, DoubleVect& porosites);
 
-  public_for_cuda
+  protected_but_public_for_cuda
   void ajouter_impl(const Equation_base& eqn, const DoubleVect& g, const int dimension, const double rho_m, const DoubleTab& tab_rho, DoubleTab& resu) const;
 
 protected:
diff --git a/src/VEF/Sources/Sources_It_Eval/Iterateur_Source_VEF_Face.h b/src/VEF/Sources/Sources_It_Eval/Iterateur_Source_VEF_Face.h
index ee720720ed..f866ee003f 100644
--- a/src/VEF/Sources/Sources_It_Eval/Iterateur_Source_VEF_Face.h
+++ b/src/VEF/Sources/Sources_It_Eval/Iterateur_Source_VEF_Face.h
@@ -70,7 +70,7 @@ class Iterateur_Source_VEF_Face: public Iterateur_Source_base
   mutable DoubleTab tab_coef_;
   DoubleVect volumes_cl_dirichlet_;
 
-  public_for_cuda
+  protected_but_public_for_cuda
   template <typename Type_Double>  DoubleTab& ajouter_faces_standard(const int, DoubleTab& ) const;
   template <typename Type_Double> DoubleTab& ajouter_faces_non_standard(const int, DoubleTab& ) const;
 };
@@ -191,6 +191,7 @@ DoubleTab& Iterateur_Source_VEF_Face<_TYPE_>::ajouter_faces_non_standard(const i
           CDoubleArrView coef = static_cast<const ArrOfDouble&>(tab_coef_).view_ro();
           DoubleArrView bilan = tab_bilan.view_rw();
           DoubleTabView resu = tab_resu.view_rw();
+          ToDo_Kokkos("create once source_view");
           Kokkos::View<double**, DoubleTabView::array_layout> source_view("source", nf, ncomp);
           Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), range_1D(0, nf), KOKKOS_LAMBDA(const int ind_face)
           {
diff --git a/src/VEF/Sources/Terme_Source_Acceleration_VEF_Face.cpp b/src/VEF/Sources/Terme_Source_Acceleration_VEF_Face.cpp
index 7c88ed7178..68ca781236 100644
--- a/src/VEF/Sources/Terme_Source_Acceleration_VEF_Face.cpp
+++ b/src/VEF/Sources/Terme_Source_Acceleration_VEF_Face.cpp
@@ -202,7 +202,6 @@ DoubleTab& Terme_Source_Acceleration_VEF_Face::ajouter(DoubleTab& resu) const
             const Front_VF& le_bord = ref_cast(Front_VF,la_cl->frontiere_dis());
             int nb_faces_bord=le_bord.nb_faces();
             ArrOfInt fait(nb_faces_bord);
-            fait = 0;
             for (int ind_face=0; ind_face<nb_faces_bord; ind_face++)
               {
                 if (fait[ind_face] == 0)
diff --git a/src/VEF/Turbulence/Modele_turbulence_hyd_LES_Smago_VEF.h b/src/VEF/Turbulence/Modele_turbulence_hyd_LES_Smago_VEF.h
index d9187c5837..0ddea22ac8 100644
--- a/src/VEF/Turbulence/Modele_turbulence_hyd_LES_Smago_VEF.h
+++ b/src/VEF/Turbulence/Modele_turbulence_hyd_LES_Smago_VEF.h
@@ -30,7 +30,7 @@ class Modele_turbulence_hyd_LES_Smago_VEF: public Modele_turbulence_hyd_LES_VEF_
 public:
   void set_param(Param& param) const override;
 
-  public_for_cuda
+  protected_but_public_for_cuda
   Champ_Fonc_base& calculer_viscosite_turbulente() override;
 
 protected:
diff --git a/src/VEF/Turbulence/Modele_turbulence_hyd_LES_Wale_VEF.h b/src/VEF/Turbulence/Modele_turbulence_hyd_LES_Wale_VEF.h
index 0002345643..bdc70dfbe5 100644
--- a/src/VEF/Turbulence/Modele_turbulence_hyd_LES_Wale_VEF.h
+++ b/src/VEF/Turbulence/Modele_turbulence_hyd_LES_Wale_VEF.h
@@ -30,7 +30,7 @@ class Modele_turbulence_hyd_LES_Wale_VEF: public Modele_turbulence_hyd_LES_VEF_b
   Modele_turbulence_hyd_LES_Wale_VEF();
   void set_param(Param& param) const override;
 
-  public_for_cuda
+  protected_but_public_for_cuda
   Champ_Fonc_base& calculer_viscosite_turbulente() override;
 
 protected:
diff --git a/tests/GPU/BFS/BFS.data b/tests/GPU/BFS/BFS.data
new file mode 100644
index 0000000000..b821954e15
--- /dev/null
+++ b/tests/GPU/BFS/BFS.data
@@ -0,0 +1,488 @@
+# LES with periodic box #
+Dimension 3
+
+Pb_Hydraulique_Turbulent pb_box
+Pb_Hydraulique_Turbulent pb_dom
+
+Domaine box
+Domaine dom
+
+# BEGIN MESH #
+Mailler box
+{
+    Pave Entree
+    {
+        /* warning dumb geometry */
+        Origine 0. 0. 0.
+        Nombre_de_Noeuds 6 6 6
+        /* Nombre_de_Noeuds 101 101 101 */
+        Longueurs 1 1 1
+    }
+    {
+        Bord periox   X = 0.  0. <= Y <= 1. 0. <= Z <= 1.
+        Bord periox   X = 1.  0. <= Y <= 1. 0. <= Z <= 1.
+        Bord wall_h   Y = 0.  0. <= X <= 1. 0. <= Z <= 1.
+        Bord wall_b   Y = 1.  0. <= X <= 1. 0. <= Z <= 1.
+        Bord perioz   Z = 0.  0. <= X <= 1. 0. <= Y <= 1.
+        Bord perioz   Z = 1.  0. <= X <= 1. 0. <= Y <= 1.
+    }
+}
+Declarer_bord_perio { domaine box bord periox }
+Declarer_bord_perio { domaine box bord perioz }
+Mailler dom
+{
+    Pave Entree
+    {
+        /* warning dumb geometry */
+        Origine 1. 0. 0.
+        Nombre_de_Noeuds 6 6 6
+        /* Nombre_de_Noeuds 101 101 101 */
+        Longueurs 1 1 1
+    }
+    {
+        Bord Inlet        X = 1.  0. <= Y <= 1. 0. <= Z <= 1.
+        Bord Outlet       X = 2.  0. <= Y <= 1. 0. <= Z <= 1.
+        Bord Up           Y = 0.  1. <= X <= 2. 0. <= Z <= 1.
+        Bord Down         Y = 1.  1. <= X <= 2. 0. <= Z <= 1.
+        Bord Perioz_dom   Z = 0.  1. <= X <= 2. 0. <= Y <= 1.
+        Bord Perioz_dom   Z = 1.  1. <= X <= 2. 0. <= Y <= 1.
+    }
+}
+Declarer_bord_perio { domaine dom bord Perioz_dom }
+# END MESH #
+
+# BEGIN PARTITION
+Partition dom
+{
+    Partition_tool Metis { Nb_parts 2 }
+    Larg_joint 2
+    zones_name DOM
+}
+Partition box
+{
+    Partition_tool Metis { Nb_parts 2 }
+    Larg_joint 2
+    zones_name BOX
+}
+End
+END PARTITION #
+
+# BEGIN SCATTER
+Scatter DOM.Zones dom
+Scatter BOX.Zones box
+END SCATTER #
+
+VDF dis 
+Lire dis { reorder { algo hilbert } }
+
+Runge_Kutta_ordre_3 sch
+Read sch
+{
+	nb_pas_dt_max 10
+	tinit 0
+	tmax 500
+	dt_impr 1e-6
+	facsec 3
+	periode_sauvegarde_securite_en_heures 1000
+	precision_impr 8
+	tcpumax 23
+}
+
+Associate pb_box box
+Associate pb_dom dom
+Probleme_Couple pb
+Associate pb pb_box
+Associate pb pb_dom
+Associate pb sch
+
+Discretize pb dis
+
+Domaine Parois_box
+Extraire_surface { probleme pb_box domaine Parois_box avec_certains_bords 2 wall_b wall_h }
+
+Domaine ParoisD_dom
+Extraire_surface { probleme pb_dom domaine ParoisD_dom avec_certains_bords 2 Up Down }
+
+Lire pb_box
+{
+	Fluide_incompressible
+	{
+		mu	champ_uniforme 1 1.41289e-04
+		rho	champ_uniforme 1 1
+	}
+	Navier_Stokes_turbulent
+	{
+		Parametre_equation	parametre_diffusion_implicite	{
+									Crank 1
+									niter_max_diffusion_implicite 300
+									preconditionnement_diag 1
+									seuil_diffusion_implicite 1e-10
+									}
+		Solveur_pression	AMG GCP { atol 1e-8 impr }
+		Convection		{ centre }
+		Diffusion		{ }
+		Conditions_initiales	{ vitesse Champ_uniforme 3 12.56 0 0 }
+		Conditions_limites	{
+					periox	periodique
+					wall_b paroi_fixe
+					wall_h paroi_fixe
+					perioz	periodique
+					}
+		Modele_turbulence	sous_maille_WALE { turbulence_paroi negligeable # loi_standard_hydr # }
+		Sources			{ canal_perio { bord periox debit_impose 12.56 } }
+	}
+	Liste_postraitements
+	{
+		postraitement INST_box
+			{
+			Definition_champs	{
+						vitesse_max	reduction_0D	{ methode max source refChamp { Pb_champ pb_box vitesse } }
+						nut_max		reduction_0D	{ methode max source refChamp { Pb_champ pb_box viscosite_turbulente } }
+						yplus_max	reduction_0D	{ methode max source refChamp { Pb_champ pb_box y_plus } }
+						Taux_cis_wall	reduction_0D	{
+										methode moyenne
+										source interpolation	{
+													localisation elem
+													domaine Parois_box
+													source refChamp { Pb_champ pb_box Taux_cisaillement }
+													}
+										}
+						integrale_U	reduction_0D	{
+										methode somme_ponderee
+										sources	{
+											transformation	{
+													methode composante numero 0
+													localisation elem
+													source refChamp { Pb_champ pb_box vitesse }
+													}
+											}
+										}
+						# y+ et utau moyennes sur les parois #
+						yplus		reduction_0D	{
+										methode moyenne
+										source interpolation	{
+													localisation elem
+													domaine Parois_box
+													source refChamp { Pb_champ pb_box y_plus }
+													}
+										}
+						utau		transformation	{
+										methode formule
+										expression 1 (2.84e-5/0.0976)*ustar/0.00125
+										localisation elem
+										source refChamp { Pb_champ pb_box u_star nom_source ustar }
+										}
+						}
+			Sondes			{
+						yplus_box		yplus		periode 1e-6 numero_elem_sur_maitre 0
+						utau_box		utau		periode 1e-6 numero_elem_sur_maitre 0
+						ustar_box		u_star		periode 1e-6 numero_elem_sur_maitre 0
+						vitesse_max_box		vitesse_max	periode 1e-6 numero_elem_sur_maitre 0
+						nut_max_box		nut_max		periode 1e-6 numero_elem_sur_maitre 0
+						yplus_max_box		yplus_max	periode 1e-6 numero_elem_sur_maitre 0
+						Taux_cis_elem0_box	Taux_cisaillement periode 1e-6 numero_elem_sur_maitre 0
+						Taux_cis_0_box		Taux_cisaillement periode 1e-6 points 1 -9.2 1.5 3.14
+						Taux_cis_wall_box 	Taux_cis_wall 	periode 1e-6 numero_elem_sur_maitre 0
+						vitesse_box		vitesse		periode 1e-6 segment 10 -9.2 1 0 -9.2 2 0
+						}
+			format lata
+			Champs binaire dt_post 1e+6
+						{
+						vitesse elem
+						viscosite_turbulente elem
+						y_plus elem
+						}
+			}
+	}
+	Sauvegarde_simple single_hdf Cas_box.sauv
+}
+
+Read pb_dom
+{
+	Fluide_incompressible
+	{
+		mu	champ_uniforme 1 1.41289e-04
+		rho	champ_uniforme 1 1
+	}
+	Navier_Stokes_Turbulent
+	{
+		Solveur_pression	AMG GCP { atol 1e-8 impr }
+		convection		{ centre }
+		diffusion		{ }
+		Conditions_initiales	{ vitesse champ_uniforme 3 12.56 0 0 }
+		boundary_conditions	{
+					Up Paroi_Fixe
+					Down Paroi_Fixe
+					Perioz_dom periodique
+					Outlet Frontiere_ouverte_pression_moyenne_imposee 0
+                                        Inlet  frontiere_ouverte_vitesse_imposee champ_front_recyclage	{
+													pb_champ_evaluateur pb_box vitesse 3
+													ampli_moyenne_imposee 3 0 0 0
+													ampli_moyenne_recyclee 3 0 0 0
+													ampli_fluctuation 3 1 1 1
+													}
+					}
+		Modele_turbulence	sous_maille_WALE { turbulence_paroi negligeable # loi_standard_hydr # }
+	}
+
+	Liste_postraitements
+	{
+		postraitement IMAGES
+			{
+			domaine dom_boundaries_Perioz_dom
+			fichier IMAGES
+			format lata
+			champs binaire dt_post 0.1
+						{
+						vitesse elem
+						vorticite elem
+                                                }
+                        }
+		postraitement INST_dom
+		{
+			Probes
+			{
+				pression_dom 	pression periode 1e-6 segment 10 7.5 0 5 7.5 2 5
+				vitesse_dom 	vitesse periode 1e-6 segment 10 7.5 0 5 7.5 2 5
+				visc_dom 	viscosite_turbulente periode 1e-6 segment 10 7.5 0 5 7.5 2 5
+				k_dom 		k periode 1e-6 segment 10 7.5 0 5 7.5 2 5
+				vitesse_dom_pt_DNS vitesse periode 1e-6 points 4 1.6 1 3.14 1.6 0.484 3.14 9.2 0.484 3.14 16.8 1 3.14
+			}
+			format lata fichier BFS
+			Champs dt_post 1e6
+			{
+				pression elem
+				vitesse elem
+				vorticite elem
+				viscosite_turbulente elem
+				k elem
+			}
+		}
+		postraitement STAT_dom
+			{
+			Definition_champs	{
+						MOY_pression	moyenne		{ t_deb 0 t_fin 1e6 source refChamp { Pb_champ pb_dom pression } }
+						MOY_yplus	moyenne		{ t_deb 0 t_fin 1e6 source refChamp { Pb_champ pb_dom y_plus } } # 2.581169e-01 #
+						EC_yplus	ecart_type	{ t_deb 0 t_fin 1e6 source refChamp { Pb_champ pb_dom y_plus } }
+						MOY_ustar	moyenne		{ t_deb 0 t_fin 1e6 source refChamp { Pb_champ pb_dom u_star } }
+						MOY_vitesse	moyenne		{ t_deb 0 t_fin 1e6 source refChamp { Pb_champ pb_dom vitesse } }
+						EC_vitesse	ecart_type	{ t_deb 0 t_fin 1e6 source refChamp { Pb_champ pb_dom vitesse } }
+						MOY_nut		moyenne		{ t_deb 0 t_fin 1e6 source refChamp { Pb_champ pb_dom viscosite_turbulente } }
+						MOY_taux_cis	moyenne		{ t_deb 0 t_fin 1e6 source refChamp { Pb_champ pb_dom Taux_cisaillement } }
+						Taux_cis_wall_	reduction_0D	{
+										methode moyenne
+										source interpolation	{
+													localisation elem
+													domaine ParoisD_dom
+													source refChamp { Pb_champ pb_dom Taux_cisaillement }
+													}
+										}
+						MOY_taux_cis_wall moyenne	{ t_deb 0 t_fin 1e6 sources_reference { Taux_cis_wall_ } }
+						
+
+						# Fluctuation de vitesse #
+						uprime		transformation	{
+										methode formule expression 1 u-UMOY
+										localisation elem
+										sources	{
+											refChamp { Pb_champ pb_dom vitesse nom_source u } ,
+											moyenne { t_deb 0 t_fin 1e6 source refChamp { Pb_champ pb_dom vitesse } nom_source UMOY }
+											}
+										}
+
+						# Composantes du tenseur de Reynolds <ui'*uj'> #
+						uprime_uprime	moyenne		{
+										t_deb 0 t_fin 1e6
+										sources	{
+											transformation	{
+													methode formule
+													expression 1 uprime*uprime
+													localisation elem
+													sources	{
+														transformation	{ methode composante numero 0 nom_source uprime sources_reference { uprime } }
+														}
+													}
+											}
+										}
+						vprime_vprime	moyenne		{
+										t_deb 0 t_fin 1e6
+										sources	{
+											transformation	{
+													methode formule
+													expression 1 vprime*vprime
+													localisation elem
+													sources	{
+														transformation	{ methode composante numero 1 nom_source vprime sources_reference { uprime } }
+														}
+													}
+											}
+										}
+						wprime_wprime	moyenne		{
+										t_deb 0 t_fin 1e6
+										sources	{
+											transformation	{
+													methode formule
+													expression 1 wprime*wprime
+													localisation elem
+													sources	{
+														transformation	{ methode composante numero 2 nom_source wprime sources_reference { uprime } }
+														}
+													}
+											}
+										}
+						uprime_vprime	moyenne		{
+										t_deb 0 t_fin 1e6
+										sources	{
+											transformation	{
+													methode formule
+													expression 1 uprime*vprime
+													localisation elem
+													sources	{
+														transformation	{ methode composante numero 0 nom_source uprime sources_reference { uprime } } ,
+														transformation	{ methode composante numero 1 nom_source vprime sources_reference { uprime } }
+														}
+													}
+											}
+										}
+						uprime_wprime	moyenne		{
+										t_deb 0 t_fin 1e6
+										sources	{
+											transformation	{
+													methode formule
+													expression 1 uprime*wprime
+													localisation elem
+													sources	{
+														transformation	{ methode composante numero 0 nom_source uprime sources_reference { uprime } } ,
+														transformation	{ methode composante numero 2 nom_source wprime sources_reference { uprime } }
+														}
+													}
+											}
+										}
+						vprime_wprime	moyenne		{
+										t_deb 0 t_fin 1e6
+										sources	{
+											transformation	{
+													methode formule
+													expression 1 vprime*wprime
+													localisation elem
+													sources	{
+														transformation	{ methode composante numero 1 nom_source vprime sources_reference { uprime } } ,
+														transformation	{ methode composante numero 2 nom_source wprime sources_reference { uprime } }
+														}
+													}
+											}
+										}
+										
+						# Derivees de vitesse dui/dxj #
+						du_dx		transformation	{ methode composante numero 0 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } }
+						du_dy		transformation	{ methode composante numero 1 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } }
+						du_dz		transformation	{ methode composante numero 2 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } }
+						dv_dx		transformation	{ methode composante numero 3 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } }
+						dv_dy		transformation	{ methode composante numero 4 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } }
+						dv_dz		transformation	{ methode composante numero 5 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } }
+						dw_dx		transformation	{ methode composante numero 6 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } }
+						dw_dy		transformation	{ methode composante numero 7 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } }
+						dw_dz		transformation	{ methode composante numero 8 localisation elem sources { refChamp { Pb_champ pb_dom gradient_vitesse } } }
+
+						# Moyenne des derivees de vitesse (obligatoire pour calculer ensuite les ecarts-types) #
+						MOY_du_dx	moyenne		{ t_deb 0 t_fin 1e6 sources_reference { du_dx } }
+						MOY_du_dy	moyenne		{ t_deb 0 t_fin 1e6 sources_reference { du_dy } }
+						MOY_du_dz	moyenne		{ t_deb 0 t_fin 1e6 sources_reference { du_dz } }
+						MOY_dv_dx	moyenne		{ t_deb 0 t_fin 1e6 sources_reference { dv_dx } }
+						MOY_dv_dy	moyenne		{ t_deb 0 t_fin 1e6 sources_reference { dv_dy } }
+						MOY_dv_dz	moyenne		{ t_deb 0 t_fin 1e6 sources_reference { dv_dz } }
+						MOY_dw_dx	moyenne		{ t_deb 0 t_fin 1e6 sources_reference { dw_dx } }
+						MOY_dw_dy	moyenne		{ t_deb 0 t_fin 1e6 sources_reference { dw_dy } }
+						MOY_dw_dz	moyenne		{ t_deb 0 t_fin 1e6 sources_reference { dw_dz } }
+
+						# Ecart-type des derivees de vitesse #
+						EC_du_dx	ecart_type	{ t_deb 0 t_fin 1e6 sources_reference { du_dx } }
+						EC_du_dy	ecart_type	{ t_deb 0 t_fin 1e6 sources_reference { du_dy } }
+						EC_du_dz	ecart_type	{ t_deb 0 t_fin 1e6 sources_reference { du_dz } }
+						EC_dv_dx	ecart_type	{ t_deb 0 t_fin 1e6 sources_reference { dv_dx } }
+						EC_dv_dy	ecart_type	{ t_deb 0 t_fin 1e6 sources_reference { dv_dy } }
+						EC_dv_dz	ecart_type	{ t_deb 0 t_fin 1e6 sources_reference { dv_dz } }
+						EC_dw_dx	ecart_type	{ t_deb 0 t_fin 1e6 sources_reference { dw_dx } }
+						EC_dw_dy	ecart_type	{ t_deb 0 t_fin 1e6 sources_reference { dw_dy } }
+						EC_dw_dz	ecart_type	{ t_deb 0 t_fin 1e6 sources_reference { dw_dz } }
+
+						# Taille de maille #
+						Delta		transformation	{ methode formule expression 1 vol^(1/3) localisation elem source refChamp { Pb_champ pb_dom volume_maille nom_source vol } }
+
+						# Energie cinetique turbulente #
+						TKE		transformation	{
+										methode formule
+										expression 1 0.5*norme_EC_vitesse*norme_EC_vitesse
+										sources	{
+											transformation	{
+													methode norme
+													localisation elem
+													nom_source norme_EC_vitesse
+													sources_reference { EC_vitesse }
+													}
+											}
+										} 
+
+						# Dissipation turbulente #
+						epsilon		transformation	{
+										methode formule
+										expression 1 (1.41289e-04/1)*(EC_du_dx^2+EC_du_dy^2+EC_du_dz^2+EC_dv_dx^2+EC_dv_dy^2+EC_dv_dz^2+EC_dw_dx^2+EC_dw_dy^2+EC_dw_dz^2)
+										localisation elem
+										sources_reference { EC_du_dx , EC_du_dy , EC_du_dz , EC_dv_dx , EC_dv_dy , EC_dv_dz , EC_dw_dx , EC_dw_dy , EC_dw_dz }
+										}
+						# y+ et utau moyennes sur les parois #
+						MOY_yplus_	reduction_0D	{
+										methode moyenne
+										source interpolation	{
+													localisation elem
+													domaine ParoisD_dom
+													sources_reference { MOY_yplus }
+													}
+										}
+						MOY_utau_	transformation	{
+										methode formule
+										expression 1 (2.84e-5/0.0976)*MOY_ustar/0.00125
+										localisation elem
+										sources_reference { MOY_ustar }
+										}
+						}
+			Sondes			{
+						MOY_yplus__dom		MOY_yplus_	periode 1e-6 numero_elem_sur_maitre 0
+						MOY_utau_dom		MOY_utau_	periode 1e-6 numero_elem_sur_maitre 0
+						MOY_taux_cis_dom	MOY_taux_cis	periode 1e-6 numero_elem_sur_maitre 0
+						MOY_taux_cis_wall_dom 	MOY_taux_cis_wall periode 1e-6 numero_elem_sur_maitre 0
+						MOY_vitesse_dom		MOY_vitesse	periode 1e-6 segment 10 1 0 0 1 2 0
+						EC_vitesse_dom		EC_vitesse	periode 1e-6 position_like MOY_vitesse_dom
+						uprime_dom		uprime		periode 1e-6 position_like MOY_vitesse_dom
+						}
+			format lml fichier BFS
+			Champs dt_post 1e+6
+						{
+						MOY_du_dy elem						
+						MOY_pression elem
+						MOY_pression som
+						# uprime elem # # diff CPU-GPU #
+						# MOY_vitesse elem # # diff CPU-GPU #
+						# EC_vitesse elem # # diff CPU-GPU #
+						MOY_nut elem
+						TKE elem
+						epsilon elem
+						# MOY_yplus elem # # assert #
+						EC_yplus elem
+						uprime_uprime elem
+						uprime_vprime elem
+						uprime_wprime elem
+						vprime_vprime elem
+						vprime_wprime elem
+						wprime_wprime elem
+						}
+			}
+	}
+	Sauvegarde_simple single_hdf Cas_dom.sauv
+}
+
+EcritureLectureSpecial 0
+
+Solve pb
+
+End
diff --git a/tests/GPU/BFS/BFS.lml.gz b/tests/GPU/BFS/BFS.lml.gz
new file mode 100644
index 0000000000..648d393137
Binary files /dev/null and b/tests/GPU/BFS/BFS.lml.gz differ
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx90a b/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx90a
new file mode 100644
index 0000000000..fc9eb16502
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx90a
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     08-06-2026 -- 14:38:46
+OS:       g1157__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                32.5304        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.762309       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               6.35718        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.706353       
+Standard deviation between time steps:                                     0.202927       
+Time elapsed in the skipped time steps:                                    0.741004       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0733417       | 10.4        | 6              
+Convection operator                      | 0.008983073     |  1.3        | 6              
+Diffusion operator                       | 0.01601393      |  2.3        | 6              
+Gradient operator                        | 0.004995961     |  0.7        | 12             
+Divergence operator                      | 0.005065148     |  0.7        | 8              
+Source terms                             | 0.0007179104    |  0.1        | 3              
+Update ::mettre_a_jour                   | 0.1242929       | 17.6        | 2              
+Computation of the time step dt          | 0.0009378377    |  0.1        | 4              
+Turbulence model::update                 | 0.01756681      |  2.5        | 2              
+Post-treatment operations                | 0.4624406       | 65.5        | 2              
+Other operations                         | -0.008002771    | -1.1        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0725315       | 10.3        | 6               | 
+Kernels:                                 | 0.561421        | 79.5        | 1328            | 
+Copy host to device:                     | 0.0021288       |  0.3        | 92              | 3.8 GB/s
+Copy device to host:                     | 0.0036422       |  0.5        | 47              | 14.7 GB/s
+Alloc/Free on device:                    | 0.00138089      |  0.2        | 590             | 
+GPU: 90% Copy H<->D: 0.82% Alloc/free: 0.2% Comm: 0% CPU & I/O: 9.2%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.976141       
+
+Total time for the whole computation                                       40.6048        
+
+[Slurm] Power consumption (51 s):  0.486 kW  0.007 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx942 b/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx942
new file mode 100644
index 0000000000..4294744882
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.adastra_gfx942
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     08-06-2026 -- 14:33:56
+OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD Instinct MI300A Accelerator
+Total number of threads:192
+GPU model: AMD Instinct MI300A
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                29.709         
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.807292       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.91469        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.212743       
+Standard deviation between time steps:                                     0.0987842      
+Time elapsed in the skipped time steps:                                    0.47084        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0450238       | 21.2        | 6              
+Convection operator                      | 0.005347514     |  2.5        | 6              
+Diffusion operator                       | 0.008616399     |  4.1        | 6              
+Gradient operator                        | 0.003200318     |  1.5        | 12             
+Divergence operator                      | 0.003620461     |  1.7        | 8              
+Source terms                             | 0.000460308     |  0.2        | 3              
+Update ::mettre_a_jour                   | 0.05670063      | 26.7        | 2              
+Computation of the time step dt          | 0.0005922301    |  0.3        | 4              
+Turbulence model::update                 | 0.004389234     |  2.1        | 2              
+Post-treatment operations                | 0.08205428      | 38.6        | 2              
+Other operations                         | 0.002738007     |  1.3        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.044401        | 20.9        | 6               | 
+Kernels:                                 | 0.105516        | 49.6        | 1328            | 
+Copy host to device:                     | 0.00202539      |  1.0        | 92              | 3.9 GB/s
+Copy device to host:                     | 0.00238012      |  1.1        | 47              | 22.5 GB/s
+Alloc/Free on device:                    | 0.000647061     |  0.3        | 590             | 
+GPU: 70% Copy H<->D: 2.1% Alloc/free: 0.3% Comm: 0% CPU & I/O: 27%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.10731        
+
+Total time for the whole computation                                       33.2018        
+
+[Slurm] Power consumption (44 s):  0.665 kW  0.008 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.dalianvl_cc100 b/tests/GPU/BFS/BFS_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..509b9d567f
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:12:20
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                14.0361        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.444288       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.31512        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.146125       
+Standard deviation between time steps:                                     0.0606734      
+Time elapsed in the skipped time steps:                                    0.284713       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0404097       | 27.7        | 6              
+Convection operator                      | 0.003551145     |  2.4        | 6              
+Diffusion operator                       | 0.00504075      |  3.4        | 6              
+Gradient operator                        | 0.002410326     |  1.6        | 12             
+Divergence operator                      | 0.003548001     |  2.4        | 8              
+Source terms                             | 0.000598559     |  0.4        | 3              
+Update ::mettre_a_jour                   | 0.02725214      | 18.6        | 2              
+Computation of the time step dt          | 0.0004161804    |  0.3        | 4              
+Turbulence model::update                 | 0.001749288     |  1.2        | 2              
+Post-treatment operations                | 0.05599893      | 38.3        | 2              
+Other operations                         | 0.005149901     |  3.5        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0392981       | 26.9        | 6               | 
+Kernels:                                 | 0.0387839       | 26.5        | 1328            | 
+Copy host to device:                     | 0.00171442      |  1.2        | 92              | 4.7 GB/s
+Copy device to host:                     | 0.00120926      |  0.8        | 35              | 44.3 GB/s
+Alloc/Free on device:                    | 0.00383176      |  2.6        | 614             | 
+GPU: 53% Copy H<->D: 2% Alloc/free: 2.6% Comm: 0% CPU & I/O: 42%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.943158       
+
+Total time for the whole computation                                       16.5791        
+
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.eureka_cc89 b/tests/GPU/BFS/BFS_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..ae0b9baaa8
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.eureka_cc89
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     30-04-2026 -- 19:56:19
+OS:       eureka__Linux__x86_64__6.14.0-37-generic__#37~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 20 10:25:38 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                112.693        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             1.45516        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               166.877        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                18.5419        
+Standard deviation between time steps:                                     5.80467        
+Time elapsed in the skipped time steps:                                    17.0173        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.109143        |  0.6        | 6              
+Convection operator                      | 0.0102421       |  0.1        | 6              
+Diffusion operator                       | 0.01638421      |  0.1        | 6              
+Gradient operator                        | 0.008781796     |  0.0        | 12             
+Divergence operator                      | 0.008933099     |  0.0        | 8              
+Source terms                             | 0.08644986      |  0.5        | 3              
+Update ::mettre_a_jour                   | 12.18951        | 65.7        | 2              
+Computation of the time step dt          | 0.08896099      |  0.5        | 4              
+Turbulence model::update                 | 2.906248        | 15.7        | 2              
+Post-treatment operations                | 5.947749        | 32.1        | 2              
+Other operations                         | -2.830534       | -15.3       | 
+
+Average number of iteration of the linear solver per call:                 11.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0760458       |  0.4        | 6               | 
+Kernels:                                 | 0.494343        |  2.7        | 1161            | 
+Copy host to device:                     | 0.421876        |  2.3        | 128             | 5.0 GB/s
+Copy device to host:                     | 0.589114        |  3.2        | 213             | 5.7 GB/s
+Alloc/Free on device:                    | 0.00224632      |  0.0        | 528             | 
+GPU: 3.1% Copy H<->D: 5.5% Alloc/free: 0.012% Comm: 0% CPU & I/O: 91%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               2.21071        
+
+Total time for the whole computation                                       298.798        
+
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/BFS/BFS_BENCH.TU.irene-amd-ccrt_cc70
new file mode 100644
index 0000000000..2b67b15e45
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.irene-amd-ccrt_cc70
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     23-04-2026 -- 09:14:58
+OS:       irene7066__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
+Total number of threads:80
+GPU model: Tesla V100-SXM2-16GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                62.3224        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             1.11062        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               93.2092        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                10.3566        
+Standard deviation between time steps:                                     3.95843        
+Time elapsed in the skipped time steps:                                    9.48538        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.132225        |  1.3        | 6              
+Convection operator                      | 0.009519812     |  0.1        | 6              
+Diffusion operator                       | 0.01566439      |  0.2        | 6              
+Gradient operator                        | 0.01099676      |  0.1        | 12             
+Divergence operator                      | 0.006549923     |  0.1        | 8              
+Source terms                             | 0.05627705      |  0.5        | 3              
+Update ::mettre_a_jour                   | 6.661139        | 64.3        | 2              
+Computation of the time step dt          | 0.04126574      |  0.4        | 4              
+Turbulence model::update                 | 1.384435        | 13.4        | 2              
+Post-treatment operations                | 3.349094        | 32.3        | 2              
+Other operations                         | -1.310585       | -12.7       | 
+
+Average number of iteration of the linear solver per call:                 11.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.110164        |  1.1        | 6               | 
+Kernels:                                 | 0.485927        |  4.7        | 1161            | 
+Copy host to device:                     | 0.480492        |  4.6        | 128             | 4.4 GB/s
+Copy device to host:                     | 0.755923        |  7.3        | 213             | 4.5 GB/s
+Alloc/Free on device:                    | 0.00129667      |  0.0        | 528             | 
+GPU: 5.8% Copy H<->D: 12% Alloc/free: 0.013% Comm: 0% CPU & I/O: 82%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.46305        
+
+Total time for the whole computation                                       166.48         
+
+[Slurm] Power consumption (196 s): 25.598 kW  1.394 kWh  0.139 € (0.10€/kWh)
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.is157091_cc86 b/tests/GPU/BFS/BFS_BENCH.TU.is157091_cc86
new file mode 100644
index 0000000000..a46ebac5f2
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.is157091_cc86
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     31-05-2026 -- 19:50:30
+OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
+CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
+Total number of threads:64
+GPU model: NVIDIA RTX A6000
+CUDA runtime version: 12.90
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                15.8403        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.559062       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.46987        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.27443        
+Standard deviation between time steps:                                     0.0601585      
+Time elapsed in the skipped time steps:                                    0.465938       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.114364        | 41.7        | 6              
+Convection operator                      | 0.007063331     |  2.6        | 6              
+Diffusion operator                       | 0.01262074      |  4.6        | 6              
+Gradient operator                        | 0.006030292     |  2.2        | 12             
+Divergence operator                      | 0.004501881     |  1.6        | 8              
+Source terms                             | 0.0007731341    |  0.3        | 3              
+Update ::mettre_a_jour                   | 0.05517616      | 20.1        | 2              
+Computation of the time step dt          | 0.001173593     |  0.4        | 4              
+Turbulence model::update                 | 0.00733994      |  2.7        | 2              
+Post-treatment operations                | 0.06002287      | 21.9        | 2              
+Other operations                         | 0.005364006     |  2.0        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.113484        | 41.4        | 6               | 
+Kernels:                                 | 0.10959         | 39.9        | 1328            | 
+Copy host to device:                     | 0.0015817       |  0.6        | 92              | 5.1 GB/s
+Copy device to host:                     | 0.00538939      |  2.0        | 35              | 9.9 GB/s
+Alloc/Free on device:                    | 0.00187427      |  0.7        | 614             | 
+GPU: 81% Copy H<->D: 2.5% Alloc/free: 0.68% Comm: 0% CPU & I/O: 15%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.673743       
+
+Total time for the whole computation                                       19.4498        
+
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.is159479_cc120 b/tests/GPU/BFS/BFS_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..e4c8b3fe0a
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.is159479_cc120
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-05-2026 -- 07:01:01
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May  1 12:45:19 UTC 2026 (6
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                12.9809        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.365144       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.20401        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.133779       
+Standard deviation between time steps:                                     0.0368064      
+Time elapsed in the skipped time steps:                                    0.346852       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0500903       | 37.4        | 6              
+Convection operator                      | 0.003679159     |  2.8        | 6              
+Diffusion operator                       | 0.006074906     |  4.5        | 6              
+Gradient operator                        | 0.001982798     |  1.5        | 12             
+Divergence operator                      | 0.002428775     |  1.8        | 8              
+Source terms                             | 0.0003626534    |  0.3        | 3              
+Update ::mettre_a_jour                   | 0.02154634      | 16.1        | 2              
+Computation of the time step dt          | 0.0004629608    |  0.3        | 4              
+Turbulence model::update                 | 0.002795345     |  2.1        | 2              
+Post-treatment operations                | 0.04208834      | 31.5        | 2              
+Other operations                         | 0.002267656     |  1.7        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0496036       | 37.1        | 6               | 
+Kernels:                                 | 0.0422833       | 31.6        | 1390            | 
+Copy host to device:                     | 0.0012351       |  0.9        | 92              | 6.5 GB/s
+Copy device to host:                     | 0.00569077      |  4.3        | 36              | 9.4 GB/s
+Alloc/Free on device:                    | 0.00122093      |  0.9        | 614             | 
+GPU: 69% Copy H<->D: 5.2% Alloc/free: 0.91% Comm: 0% CPU & I/O: 25%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.542056       
+
+Total time for the whole computation                                       15.0739        
+
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.is246827_cc86 b/tests/GPU/BFS/BFS_BENCH.TU.is246827_cc86
new file mode 100644
index 0000000000..f89780e8bd
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.is246827_cc86
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     04-05-2026 -- 19:18:54
+OS:       is246827__Linux__x86_64__6.2.9-300.fc38.x86_64__#1 SMP PREEMPT_DYNAMIC Thu Mar 30 22:32:58 UTC 2023
+CPU model : 12th Gen Intel(R) Core(TM) i7-12850HX
+Total number of threads:24
+GPU model: NVIDIA RTX A3000 12GB Laptop GPU
+CUDA runtime version: 12.90
+CUDA drivers version: 12.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                26.4517        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.818174       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               8.40028        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.933365       
+Standard deviation between time steps:                                     0.137856       
+Time elapsed in the skipped time steps:                                    1.01629        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.485421        | 52.0        | 6              
+Convection operator                      | 0.0194105       |  2.1        | 6              
+Diffusion operator                       | 0.04692109      |  5.0        | 6              
+Gradient operator                        | 0.01502096      |  1.6        | 12             
+Divergence operator                      | 0.01077627      |  1.2        | 8              
+Source terms                             | 0.001760336     |  0.2        | 3              
+Update ::mettre_a_jour                   | 0.1845253       | 19.8        | 2              
+Computation of the time step dt          | 0.005321171     |  0.6        | 4              
+Turbulence model::update                 | 0.02338764      |  2.5        | 2              
+Post-treatment operations                | 0.1294219       | 13.9        | 2              
+Other operations                         | 0.01139873      |  1.2        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.48335         | 51.8        | 6               | 
+Kernels:                                 | 0.355819        | 38.1        | 1346            | 
+Copy host to device:                     | 0.00315205      |  0.3        | 80              | 4.5 GB/s
+Copy device to host:                     | 0.0202789       |  2.2        | 86              | 7.7 GB/s
+Alloc/Free on device:                    | 0.00149475      |  0.2        | 568             | 
+GPU: 90% Copy H<->D: 2.5% Alloc/free: 0.16% Comm: 0% CPU & I/O: 7.4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.929967       
+
+Total time for the whole computation                                       36.7983        
+
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.is247793_gfx1100 b/tests/GPU/BFS/BFS_BENCH.TU.is247793_gfx1100
new file mode 100644
index 0000000000..3d23b652a7
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.is247793_gfx1100
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 18:29:42
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                15.1276        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.812923       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               3.70678        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.411865       
+Standard deviation between time steps:                                     0.0871853      
+Time elapsed in the skipped time steps:                                    0.705107       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.116422        | 28.3        | 6              
+Convection operator                      | 0.008319375     |  2.0        | 6              
+Diffusion operator                       | 0.01516901      |  3.7        | 6              
+Gradient operator                        | 0.005098916     |  1.2        | 12             
+Divergence operator                      | 0.005316149     |  1.3        | 8              
+Source terms                             | 0.0009473977    |  0.2        | 3              
+Update ::mettre_a_jour                   | 0.05988134      | 14.5        | 2              
+Computation of the time step dt          | 0.001219753     |  0.3        | 4              
+Turbulence model::update                 | 0.008179361     |  2.0        | 2              
+Post-treatment operations                | 0.1878769       | 45.6        | 2              
+Other operations                         | 0.003434582     |  0.8        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.115614        | 28.1        | 6               | 
+Kernels:                                 | 0.247483        | 60.1        | 1328            | 
+Copy host to device:                     | 0.00247318      |  0.6        | 92              | 3.2 GB/s
+Copy device to host:                     | 0.00313158      |  0.8        | 35              | 17.1 GB/s
+Alloc/Free on device:                    | 0.00252754      |  0.6        | 614             | 
+GPU: 88% Copy H<->D: 1.4% Alloc/free: 0.61% Comm: 0% CPU & I/O: 9.9%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.993357       
+
+Total time for the whole computation                                       20.5329        
+
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.jean-zay_cc90 b/tests/GPU/BFS/BFS_BENCH.TU.jean-zay_cc90
new file mode 100644
index 0000000000..93e7847b7e
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.jean-zay_cc90
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-05-2026 -- 09:48:00
+OS:       jzxh116__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                27.5174        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.937468       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.7255         
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.191723       
+Standard deviation between time steps:                                     0.135704       
+Time elapsed in the skipped time steps:                                    1.00399        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0478737       | 25.0        | 6              
+Convection operator                      | 0.005209725     |  2.7        | 6              
+Diffusion operator                       | 0.00718375      |  3.7        | 6              
+Gradient operator                        | 0.002878709     |  1.5        | 12             
+Divergence operator                      | 0.003925051     |  2.0        | 8              
+Source terms                             | 0.0005442999    |  0.3        | 3              
+Update ::mettre_a_jour                   | 0.02739966      | 14.3        | 2              
+Computation of the time step dt          | 0.0004660597    |  0.2        | 4              
+Turbulence model::update                 | 0.002827991     |  1.5        | 2              
+Post-treatment operations                | 0.08851215      | 46.2        | 2              
+Other operations                         | 0.004901591     |  2.6        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0470228       | 24.5        | 6               | 
+Kernels:                                 | 0.0509353       | 26.6        | 1390            | 
+Copy host to device:                     | 0.00195763      |  1.0        | 92              | 4.1 GB/s
+Copy device to host:                     | 0.00493976      |  2.6        | 36              | 10.9 GB/s
+Alloc/Free on device:                    | 0.00211832      |  1.1        | 614             | 
+GPU: 51% Copy H<->D: 3.6% Alloc/free: 1.1% Comm: 0% CPU & I/O: 44%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.87826        
+
+Total time for the whole computation                                       31.1252        
+
+[Slurm] Power consumption (57 s):  0.444 kW  0.007 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.lumi_gfx90a b/tests/GPU/BFS/BFS_BENCH.TU.lumi_gfx90a
new file mode 100644
index 0000000000..1b6a3f5859
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.lumi_gfx90a
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     18-05-2026 -- 08:29:33
+OS:       nid007956__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                70.5197        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             1.27103        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               6.05094        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.672327       
+Standard deviation between time steps:                                     0.171609       
+Time elapsed in the skipped time steps:                                    0.825163       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0661606       |  9.8        | 6              
+Convection operator                      | 0.01064927      |  1.6        | 6              
+Diffusion operator                       | 0.01735671      |  2.6        | 6              
+Gradient operator                        | 0.004805947     |  0.7        | 12             
+Divergence operator                      | 0.005487979     |  0.8        | 8              
+Source terms                             | 0.000698267     |  0.1        | 3              
+Update ::mettre_a_jour                   | 0.1170135       | 17.4        | 2              
+Computation of the time step dt          | 0.0009277774    |  0.1        | 4              
+Turbulence model::update                 | 0.01760316      |  2.6        | 2              
+Post-treatment operations                | 0.4398524       | 65.4        | 2              
+Other operations                         | -0.008228412    | -1.2        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0653104       |  9.7        | 6               | 
+Kernels:                                 | 0.537028        | 79.9        | 1328            | 
+Copy host to device:                     | 0.00251406      |  0.4        | 92              | 3.2 GB/s
+Copy device to host:                     | 0.00320432      |  0.5        | 35              | 16.7 GB/s
+Alloc/Free on device:                    | 0.00185866      |  0.3        | 614             | 
+GPU: 90% Copy H<->D: 0.85% Alloc/free: 0.28% Comm: 0% CPU & I/O: 9.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.66404        
+
+Total time for the whole computation                                       79.0599        
+
+[Slurm] Power consumption (106 s):  0.471 kW  0.014 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/BFS/BFS_BENCH.TU.topaze_cc80 b/tests/GPU/BFS/BFS_BENCH.TU.topaze_cc80
new file mode 100644
index 0000000000..db25e32d24
--- /dev/null
+++ b/tests/GPU/BFS/BFS_BENCH.TU.topaze_cc80
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the BFS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     15-05-2026 -- 13:20:33
+OS:       topaze7064__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : AMD EPYC 7763 64-Core Processor
+Total number of threads:256
+GPU model: NVIDIA A100-SXM4-80GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                22.475         
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.868012       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.82319        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.202576       
+Standard deviation between time steps:                                     0.0779406      
+Time elapsed in the skipped time steps:                                    0.885518       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0650238       | 32.1        | 6              
+Convection operator                      | 0.006038481     |  3.0        | 6              
+Diffusion operator                       | 0.008644277     |  4.3        | 6              
+Gradient operator                        | 0.003602949     |  1.8        | 12             
+Divergence operator                      | 0.00453026      |  2.2        | 8              
+Source terms                             | 0.000635081     |  0.3        | 3              
+Update ::mettre_a_jour                   | 0.03531282      | 17.4        | 2              
+Computation of the time step dt          | 0.0006298844    |  0.3        | 4              
+Turbulence model::update                 | 0.003786783     |  1.9        | 2              
+Post-treatment operations                | 0.07020698      | 34.7        | 2              
+Other operations                         | 0.004165048     |  2.1        | 
+
+Average number of iteration of the linear solver per call:                 11.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0639933       | 31.6        | 6               | 
+Kernels:                                 | 0.0611641       | 30.2        | 1328            | 
+Copy host to device:                     | 0.00190596      |  0.9        | 92              | 4.2 GB/s
+Copy device to host:                     | 0.00386666      |  1.9        | 35              | 13.9 GB/s
+Alloc/Free on device:                    | 0.00295996      |  1.5        | 614             | 
+GPU: 62% Copy H<->D: 2.8% Alloc/free: 1.5% Comm: 0% CPU & I/O: 34%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.33353        
+
+Total time for the whole computation                                       26.5173        
+
+[Slurm] Power consumption (66 s):  0.545 kW  0.010 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/BFS/BFS_K_DOM.son.ref b/tests/GPU/BFS/BFS_K_DOM.son.ref
new file mode 100644
index 0000000000..f318db270d
--- /dev/null
+++ b/tests/GPU/BFS/BFS_K_DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_K_DOM.son
+# Temps x= 7.50000000e+00 y= 0.00000000e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.22222222e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 4.44444444e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 6.66666667e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 8.88888889e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.11111111e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.33333333e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.55555556e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.77777778e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.00000000e+00 z= 5.00000000e+00
+# Champ K [m2/s2]
+# Type SEGMENT 7.500000 0.000000 5.000000 7.500000 2.000000 5.000000
+0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_MOY_TAUX_CIS_DOM.son.ref b/tests/GPU/BFS/BFS_MOY_TAUX_CIS_DOM.son.ref
new file mode 100644
index 0000000000..ff32e84b81
--- /dev/null
+++ b/tests/GPU/BFS/BFS_MOY_TAUX_CIS_DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_MOY_TAUX_CIS_DOM.son
+# Temps x= 1.10000000e+00 y= 1.00000000e-01 z= 1.00000000e-01
+# Champ MOY_TAUX_CIS [s-1]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 0.00000000e+00
+4.77545847e-02 3.14010496e+01
+9.55010851e-02 3.14003238e+01
+1.43253236e-01 3.13999909e+01
+1.91000668e-01 3.14006385e+01
+2.38742902e-01 3.14009290e+01
+2.86475840e-01 3.14032656e+01
+3.34185730e-01 3.14032909e+01
+3.81859477e-01 3.14079857e+01
+4.29444051e-01 3.14065507e+01
+4.76994676e-01 3.14143110e+01
diff --git a/tests/GPU/BFS/BFS_MOY_TAUX_CIS_WALL_DOM.son.ref b/tests/GPU/BFS/BFS_MOY_TAUX_CIS_WALL_DOM.son.ref
new file mode 100644
index 0000000000..06bb4ac1ce
--- /dev/null
+++ b/tests/GPU/BFS/BFS_MOY_TAUX_CIS_WALL_DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_MOY_TAUX_CIS_WALL_DOM.son
+# Temps x= 1.10000000e+00 y= 0.00000000e+00 z= 1.00000000e-01
+# Champ MOY_TAUX_CIS_WALL [s-1]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 0.00000000e+00
+4.77545847e-02 5.02399803e+01
+9.55010851e-02 5.02370293e+01
+1.43253236e-01 5.02342198e+01
+1.91000668e-01 5.02319335e+01
+2.38742902e-01 5.02303891e+01
+2.86475840e-01 5.02291756e+01
+3.34185730e-01 5.02292593e+01
+3.81859477e-01 5.02286506e+01
+4.29444051e-01 5.02306561e+01
+4.76994676e-01 5.02297405e+01
diff --git a/tests/GPU/BFS/BFS_MOY_UTAU_DOM.son.ref b/tests/GPU/BFS/BFS_MOY_UTAU_DOM.son.ref
new file mode 100644
index 0000000000..0e1da14445
--- /dev/null
+++ b/tests/GPU/BFS/BFS_MOY_UTAU_DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_MOY_UTAU_DOM.son
+# Temps x= 1.10000000e+00 y= 1.00000000e-01 z= 1.00000000e-01
+# Champ MOY_UTAU_ [??]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00
+9.55010851e-02 0.00000000e+00
+1.43253236e-01 0.00000000e+00
+1.91000668e-01 0.00000000e+00
+2.38742902e-01 0.00000000e+00
+2.86475840e-01 0.00000000e+00
+3.34185730e-01 0.00000000e+00
+3.81859477e-01 0.00000000e+00
+4.29444051e-01 0.00000000e+00
+4.76994676e-01 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_MOY_VITESSE_DOM.son.ref b/tests/GPU/BFS/BFS_MOY_VITESSE_DOM.son.ref
new file mode 100644
index 0000000000..cac261ddc0
--- /dev/null
+++ b/tests/GPU/BFS/BFS_MOY_VITESSE_DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_MOY_VITESSE_DOM.son
+# Temps x= 1.00000000e+00 y= 0.00000000e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 2.22222222e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 4.44444444e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 6.66666667e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 8.88888889e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.11111111e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.33333333e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.55555556e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.77777778e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 2.00000000e+00 z= 0.00000000e+00
+# Champ MOY_VITESSE [m/s]
+# Type SEGMENT 1.000000 0.000000 0.000000 1.000000 2.000000 0.000000
+0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.77545847e-02 1.25600000e+01 0.00000000e+00 1.86893863e-10 1.25600000e+01 2.49272462e-04 2.25726378e-10 1.25600000e+01 3.26202885e-05 2.26774129e-10 1.25600000e+01 -1.30174838e-04 1.52115499e-10 1.25600000e+01 -1.51717562e-04 1.94261376e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+9.55010851e-02 1.25578821e+01 0.00000000e+00 -6.74814495e-11 1.25599998e+01 -5.00094655e-04 -9.33348727e-11 1.25600000e+01 -3.70617170e-05 2.44534252e-10 1.25599998e+01 2.29229623e-04 1.09188400e-10 1.25578821e+01 3.07926204e-04 -2.09276414e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.43253236e-01 1.25558778e+01 0.00000000e+00 -2.33903184e-10 1.25601124e+01 -1.08661111e-03 -4.33076349e-10 1.25601130e+01 -1.34745899e-04 9.29268864e-11 1.25601124e+01 5.59067494e-04 4.05838848e-12 1.25558778e+01 6.62296352e-04 -4.03340234e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.91000668e-01 1.25540127e+01 0.00000000e+00 -2.54717524e-10 1.25603630e+01 -1.11814645e-03 -4.76759542e-10 1.25603643e+01 -1.18765710e-04 2.36734674e-10 1.25603630e+01 5.52917218e-04 3.25693525e-10 1.25540127e+01 6.83996235e-04 -8.06715768e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.38742902e-01 1.25523012e+01 0.00000000e+00 -3.25918676e-10 1.25607663e+01 -1.41217546e-03 -8.40371750e-10 1.25607685e+01 -1.76303606e-04 -2.36876881e-10 1.25607663e+01 7.27905812e-04 6.96667544e-11 1.25523012e+01 8.60593826e-04 -9.60868743e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.86475840e-01 1.25507473e+01 0.00000000e+00 -2.90044977e-11 1.25613259e+01 -1.39385504e-03 -1.95285024e-10 1.25613292e+01 -1.70363051e-04 6.15474726e-10 1.25613259e+01 7.14357395e-04 1.11326854e-09 1.25507473e+01 8.49866465e-04 -1.23527436e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.34185730e-01 1.25493451e+01 0.00000000e+00 -8.07691414e-10 1.25620356e+01 -1.45720521e-03 -1.68645372e-09 1.25620402e+01 -1.55088937e-04 -1.26564152e-09 1.25620356e+01 7.20918718e-04 -7.41614738e-10 1.25493451e+01 8.91452479e-04 -1.50045991e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.81859477e-01 1.25480804e+01 0.00000000e+00 1.17994467e-09 1.25628805e+01 -1.86490361e-03 1.64173303e-09 1.25628868e+01 -2.90941960e-04 2.62554731e-09 1.25628805e+01 1.02667324e-03 3.58882467e-09 1.25480804e+01 1.12919124e-03 -7.34502178e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.29444051e-01 1.25469333e+01 0.00000000e+00 -3.43417239e-09 1.25638388e+01 -9.58508785e-04 -5.50618813e-09 1.25638468e+01 7.74674626e-06 -5.16764276e-09 1.25638388e+01 3.50728206e-04 -5.32881603e-09 1.25469333e+01 6.00250616e-04 -3.57486443e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.76994676e-01 1.25458771e+01 0.00000000e+00 7.27711023e-09 1.25648858e+01 -3.08862823e-03 9.80741271e-09 1.25648959e+01 -6.15540112e-04 1.04150364e-08 1.25648858e+01 1.85084787e-03 1.36067776e-08 1.25458771e+01 1.85341917e-03 4.08437442e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_MOY_YPLUS__DOM.son.ref b/tests/GPU/BFS/BFS_MOY_YPLUS__DOM.son.ref
new file mode 100644
index 0000000000..36fee62da6
--- /dev/null
+++ b/tests/GPU/BFS/BFS_MOY_YPLUS__DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_MOY_YPLUS__DOM.son
+# Temps x= 1.10000000e+00 y= 0.00000000e+00 z= 1.00000000e-01
+# Champ MOY_YPLUS_ [adimensionnel]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 0.00000000e+00
+4.77545847e-02 9.42798792e+01
+9.55010851e-02 9.42755547e+01
+1.43253236e-01 9.42701369e+01
+1.91000668e-01 9.42645492e+01
+2.38742902e-01 9.42588960e+01
+2.86475840e-01 9.42535715e+01
+3.34185730e-01 9.42490186e+01
+3.81859477e-01 9.42439293e+01
+4.29444051e-01 9.42412777e+01
+4.76994676e-01 9.42350202e+01
diff --git a/tests/GPU/BFS/BFS_NUT_MAX_BOX.son.ref b/tests/GPU/BFS/BFS_NUT_MAX_BOX.son.ref
new file mode 100644
index 0000000000..d4ffda5873
--- /dev/null
+++ b/tests/GPU/BFS/BFS_NUT_MAX_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_NUT_MAX_BOX.son
+# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01
+# Champ NUT_MAX [m2/s]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00
+9.55010851e-02 0.00000000e+00
+1.43253236e-01 0.00000000e+00
+1.91000668e-01 0.00000000e+00
+2.38742902e-01 0.00000000e+00
+2.86475840e-01 0.00000000e+00
+3.34185730e-01 0.00000000e+00
+3.81859477e-01 0.00000000e+00
+4.29444051e-01 0.00000000e+00
+4.76994676e-01 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_PRESSION_DOM.son.ref b/tests/GPU/BFS/BFS_PRESSION_DOM.son.ref
new file mode 100644
index 0000000000..9d5fd3115e
--- /dev/null
+++ b/tests/GPU/BFS/BFS_PRESSION_DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_PRESSION_DOM.son
+# Temps x= 7.50000000e+00 y= 0.00000000e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.22222222e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 4.44444444e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 6.66666667e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 8.88888889e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.11111111e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.33333333e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.55555556e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.77777778e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.00000000e+00 z= 5.00000000e+00
+# Champ PRESSION [Pa.m3/kg]
+# Type SEGMENT 7.500000 0.000000 5.000000 7.500000 2.000000 5.000000
+0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_TAUX_CIS_0_BOX.son.ref b/tests/GPU/BFS/BFS_TAUX_CIS_0_BOX.son.ref
new file mode 100644
index 0000000000..47d276472b
--- /dev/null
+++ b/tests/GPU/BFS/BFS_TAUX_CIS_0_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_TAUX_CIS_0_BOX.son
+# Temps x= -9.20000000e+00 y= 1.50000000e+00 z= 3.14000000e+00
+# Champ TAUX_CISAILLEMENT [s-1]
+# Type POINTS
+0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00
+9.55010851e-02 0.00000000e+00
+1.43253236e-01 0.00000000e+00
+1.91000668e-01 0.00000000e+00
+2.38742902e-01 0.00000000e+00
+2.86475840e-01 0.00000000e+00
+3.34185730e-01 0.00000000e+00
+3.81859477e-01 0.00000000e+00
+4.29444051e-01 0.00000000e+00
+4.76994676e-01 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_TAUX_CIS_ELEM0_BOX.son.ref b/tests/GPU/BFS/BFS_TAUX_CIS_ELEM0_BOX.son.ref
new file mode 100644
index 0000000000..2383069f4f
--- /dev/null
+++ b/tests/GPU/BFS/BFS_TAUX_CIS_ELEM0_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_TAUX_CIS_ELEM0_BOX.son
+# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01
+# Champ TAUX_CISAILLEMENT [s-1]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 6.28000000e+01
+4.77545847e-02 6.27894087e+01
+9.55010851e-02 6.27805172e+01
+1.43253236e-01 6.27738299e+01
+1.91000668e-01 6.27695848e+01
+2.38742902e-01 6.27677501e+01
+2.86475840e-01 6.27680561e+01
+3.34185730e-01 6.27700461e+01
+3.81859477e-01 6.27731398e+01
+4.29444051e-01 6.27767021e+01
+4.76994676e-01 6.27801229e+01
diff --git a/tests/GPU/BFS/BFS_TAUX_CIS_WALL_BOX.son.ref b/tests/GPU/BFS/BFS_TAUX_CIS_WALL_BOX.son.ref
new file mode 100644
index 0000000000..fe62cab1ed
--- /dev/null
+++ b/tests/GPU/BFS/BFS_TAUX_CIS_WALL_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_TAUX_CIS_WALL_BOX.son
+# Temps x= 1.00000000e-01 y= 0.00000000e+00 z= 1.00000000e-01
+# Champ TAUX_CIS_WALL [s-1]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 6.28000000e+01
+4.77545847e-02 6.27894087e+01
+9.55010851e-02 6.27805172e+01
+1.43253236e-01 6.27738299e+01
+1.91000668e-01 6.27695848e+01
+2.38742902e-01 6.27677501e+01
+2.86475840e-01 6.27680561e+01
+3.34185730e-01 6.27700461e+01
+3.81859477e-01 6.27731398e+01
+4.29444051e-01 6.27767021e+01
+4.76994676e-01 6.27801229e+01
diff --git a/tests/GPU/BFS/BFS_UPRIME_DOM.son.ref b/tests/GPU/BFS/BFS_UPRIME_DOM.son.ref
new file mode 100644
index 0000000000..2c34efdee5
--- /dev/null
+++ b/tests/GPU/BFS/BFS_UPRIME_DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_UPRIME_DOM.son
+# Temps x= 1.00000000e+00 y= 0.00000000e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 2.22222222e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 4.44444444e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 6.66666667e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 8.88888889e-01 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.11111111e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.33333333e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.55555556e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 1.77777778e+00 z= 0.00000000e+00 x= 1.00000000e+00 y= 2.00000000e+00 z= 0.00000000e+00
+# Champ UPRIME [??]
+# Type SEGMENT 1.000000 0.000000 0.000000 1.000000 2.000000 0.000000
+0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.77635684e-15 0.00000000e+00 0.00000000e+00 1.77635684e-15 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+9.55010851e-02 -1.70451372e-03 -4.13749826e-04 -2.05368209e-10 -3.51204031e-04 -4.76474206e-04 -1.92154026e-10 -1.25448652e-04 -1.10328445e-10 7.95685926e-13 -3.51204035e-04 4.76473702e-04 -1.43875005e-11 -1.70451396e-03 4.13749433e-04 -1.49678097e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.43253236e-01 -3.37059554e-03 -6.37830869e-04 -5.20499010e-10 -2.36895221e-04 -8.13654229e-04 -3.28825657e-10 -1.25695954e-04 -5.60994423e-09 -1.89704729e-10 -2.36915066e-04 8.13662528e-04 -2.00995072e-10 -3.37058137e-03 6.37844777e-04 -2.63889988e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.91000668e-01 -5.53720379e-03 -5.86131934e-05 -8.92472976e-11 6.50296474e-04 -1.54672586e-05 2.45320995e-11 8.40300507e-04 9.67721645e-09 1.60411020e-10 6.50324971e-04 1.54579017e-05 5.40508648e-10 -5.53722287e-03 5.85941593e-05 -5.30380284e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.38742902e-01 -6.21119627e-03 -6.35780484e-04 -8.29535899e-10 1.18474075e-03 -8.42938087e-04 -6.49151322e-10 1.20264813e-03 -3.30813897e-08 -1.07479952e-09 1.18463084e-03 8.42981478e-04 -5.64928920e-10 -6.21111987e-03 6.35856955e-04 -5.29458841e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.86475840e-01 -7.82040184e-03 4.82026519e-05 1.25767714e-09 2.82047684e-03 7.49260474e-05 1.75858909e-09 2.85804539e-03 4.46541338e-08 2.15503276e-09 2.82060851e-03 -7.49676853e-05 2.62876573e-09 -7.82048881e-03 -4.82889439e-05 -5.18621113e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.34185730e-01 -8.19520068e-03 -2.24296228e-04 -3.79980139e-09 3.95431085e-03 -1.41674697e-04 -4.81063324e-09 4.43486449e-03 -1.97341878e-07 -6.21367146e-09 3.95367378e-03 1.41915440e-04 -5.60304701e-09 -8.19476162e-03 2.24734313e-04 -1.26783565e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.81859477e-01 -7.36501478e-03 -1.50041972e-03 8.78165311e-09 5.27973068e-03 -2.35762953e-03 1.28814437e-08 4.21941288e-03 3.29118509e-07 1.53989989e-08 5.28062267e-03 2.35740031e-03 1.55525360e-08 -7.36557577e-03 1.49986138e-03 3.22715690e-09 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.29444051e-01 -1.30274103e-02 3.82176250e-03 -2.17536301e-08 9.35376067e-03 5.97926885e-03 -3.23084243e-08 1.20198925e-02 -6.62978936e-07 -3.65563323e-08 9.35153267e-03 -5.97837566e-03 -3.74076058e-08 -1.30258490e-02 -3.82020633e-03 -1.22488296e-08 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.76994676e-01 6.49716025e-04 -1.01878961e-02 5.36417951e-08 4.33501496e-03 -1.52546095e-02 7.96838950e-08 -6.59891810e-04 1.32714207e-06 8.54376745e-08 4.33828965e-03 1.52540096e-02 9.24238394e-08 6.47779337e-04 1.01859691e-02 3.61955081e-08 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_USTAR_BOX.son.ref b/tests/GPU/BFS/BFS_USTAR_BOX.son.ref
new file mode 100644
index 0000000000..d55d0b146c
--- /dev/null
+++ b/tests/GPU/BFS/BFS_USTAR_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_USTAR_BOX.son
+# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01
+# Champ U_STAR [m2/s2]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00
+9.55010851e-02 0.00000000e+00
+1.43253236e-01 0.00000000e+00
+1.91000668e-01 0.00000000e+00
+2.38742902e-01 0.00000000e+00
+2.86475840e-01 0.00000000e+00
+3.34185730e-01 0.00000000e+00
+3.81859477e-01 0.00000000e+00
+4.29444051e-01 0.00000000e+00
+4.76994676e-01 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_UTAU_BOX.son.ref b/tests/GPU/BFS/BFS_UTAU_BOX.son.ref
new file mode 100644
index 0000000000..63d1a953b1
--- /dev/null
+++ b/tests/GPU/BFS/BFS_UTAU_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_UTAU_BOX.son
+# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01
+# Champ UTAU [??]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00
+9.55010851e-02 0.00000000e+00
+1.43253236e-01 0.00000000e+00
+1.91000668e-01 0.00000000e+00
+2.38742902e-01 0.00000000e+00
+2.86475840e-01 0.00000000e+00
+3.34185730e-01 0.00000000e+00
+3.81859477e-01 0.00000000e+00
+4.29444051e-01 0.00000000e+00
+4.76994676e-01 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_VISC_DOM.son.ref b/tests/GPU/BFS/BFS_VISC_DOM.son.ref
new file mode 100644
index 0000000000..b5175fac37
--- /dev/null
+++ b/tests/GPU/BFS/BFS_VISC_DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_VISC_DOM.son
+# Temps x= 7.50000000e+00 y= 0.00000000e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.22222222e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 4.44444444e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 6.66666667e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 8.88888889e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.11111111e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.33333333e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.55555556e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.77777778e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.00000000e+00 z= 5.00000000e+00
+# Champ VISCOSITE_TURBULENTE [m2/s]
+# Type SEGMENT 7.500000 0.000000 5.000000 7.500000 2.000000 5.000000
+0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_VITESSE_BOX.son.ref b/tests/GPU/BFS/BFS_VITESSE_BOX.son.ref
new file mode 100644
index 0000000000..b43caa59fd
--- /dev/null
+++ b/tests/GPU/BFS/BFS_VITESSE_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_VITESSE_BOX.son
+# Temps x= -9.20000000e+00 y= 1.00000000e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.11111111e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.22222222e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.33333333e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.44444444e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.55555556e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.66666667e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.77777778e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 1.88888889e+00 z= 0.00000000e+00 x= -9.20000000e+00 y= 2.00000000e+00 z= 0.00000000e+00
+# Champ VITESSE [m/s]
+# Type SEGMENT -9.200000 1.000000 0.000000 -9.200000 2.000000 0.000000
+0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_VITESSE_DOM.son.ref b/tests/GPU/BFS/BFS_VITESSE_DOM.son.ref
new file mode 100644
index 0000000000..2461a73758
--- /dev/null
+++ b/tests/GPU/BFS/BFS_VITESSE_DOM.son.ref
@@ -0,0 +1,15 @@
+# BFS_VITESSE_DOM.son
+# Temps x= 7.50000000e+00 y= 0.00000000e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.22222222e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 4.44444444e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 6.66666667e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 8.88888889e-01 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.11111111e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.33333333e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.55555556e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 1.77777778e+00 z= 5.00000000e+00 x= 7.50000000e+00 y= 2.00000000e+00 z= 5.00000000e+00
+# Champ VITESSE [m/s]
+# Type SEGMENT 7.500000 0.000000 5.000000 7.500000 2.000000 5.000000
+0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_VITESSE_DOM_PT_DNS.son.ref b/tests/GPU/BFS/BFS_VITESSE_DOM_PT_DNS.son.ref
new file mode 100644
index 0000000000..69c9bbeed6
--- /dev/null
+++ b/tests/GPU/BFS/BFS_VITESSE_DOM_PT_DNS.son.ref
@@ -0,0 +1,15 @@
+# BFS_VITESSE_DOM_PT_DNS.son
+# Temps x= 1.60000000e+00 y= 1.00000000e+00 z= 3.14000000e+00 x= 1.60000000e+00 y= 4.84000000e-01 z= 3.14000000e+00 x= 9.20000000e+00 y= 4.84000000e-01 z= 3.14000000e+00 x= 1.68000000e+01 y= 1.00000000e+00 z= 3.14000000e+00
+# Champ VITESSE [m/s]
+# Type POINTS
+0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.77545847e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+9.55010851e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.43253236e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+1.91000668e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.38742902e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+2.86475840e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.34185730e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+3.81859477e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.29444051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
+4.76994676e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_VITESSE_MAX_BOX.son.ref b/tests/GPU/BFS/BFS_VITESSE_MAX_BOX.son.ref
new file mode 100644
index 0000000000..0ef4d6f1b3
--- /dev/null
+++ b/tests/GPU/BFS/BFS_VITESSE_MAX_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_VITESSE_MAX_BOX.son
+# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01
+# Champ VITESSE_MAX [m/s]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 1.25600000e+01 0.00000000e+00 0.00000000e+00
+4.77545847e-02 1.25600000e+01 0.00000000e+00 0.00000000e+00
+9.55010851e-02 1.25603389e+01 0.00000000e+00 0.00000000e+00
+1.43253236e-01 1.25611183e+01 0.00000000e+00 0.00000000e+00
+1.91000668e-01 1.25623854e+01 0.00000000e+00 0.00000000e+00
+2.38742902e-01 1.25641338e+01 0.00000000e+00 0.00000000e+00
+2.86475840e-01 1.25663097e+01 0.00000000e+00 0.00000000e+00
+3.34185730e-01 1.25688209e+01 0.00000000e+00 0.00000000e+00
+3.81859477e-01 1.25715511e+01 0.00000000e+00 0.00000000e+00
+4.29444051e-01 1.25743707e+01 0.00000000e+00 0.00000000e+00
+4.76994676e-01 1.25771604e+01 0.00000000e+00 0.00000000e+00
diff --git a/tests/GPU/BFS/BFS_YPLUS_BOX.son.ref b/tests/GPU/BFS/BFS_YPLUS_BOX.son.ref
new file mode 100644
index 0000000000..5e96303584
--- /dev/null
+++ b/tests/GPU/BFS/BFS_YPLUS_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_YPLUS_BOX.son
+# Temps x= 1.00000000e-01 y= 0.00000000e+00 z= 1.00000000e-01
+# Champ YPLUS [adimensionnel]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 9.42845742e+01
+4.77545847e-02 9.42686730e+01
+9.55010851e-02 9.42540521e+01
+1.43253236e-01 9.42410884e+01
+1.91000668e-01 9.42299628e+01
+2.38742902e-01 9.42206518e+01
+2.86475840e-01 9.42129538e+01
+3.34185730e-01 9.42065280e+01
+3.81859477e-01 9.42009407e+01
+4.29444051e-01 9.41957235e+01
+4.76994676e-01 9.41904089e+01
diff --git a/tests/GPU/BFS/BFS_YPLUS_MAX_BOX.son.ref b/tests/GPU/BFS/BFS_YPLUS_MAX_BOX.son.ref
new file mode 100644
index 0000000000..8f61c144fa
--- /dev/null
+++ b/tests/GPU/BFS/BFS_YPLUS_MAX_BOX.son.ref
@@ -0,0 +1,15 @@
+# BFS_YPLUS_MAX_BOX.son
+# Temps x= 1.00000000e-01 y= 1.00000000e-01 z= 1.00000000e-01
+# Champ YPLUS_MAX [adimensionnel]
+# Type NUMERO_ELEM_SUR_MAITRE
+0.00000000e+00 9.42845742e+01
+4.77545847e-02 9.42686730e+01
+9.55010851e-02 9.42540521e+01
+1.43253236e-01 9.42410884e+01
+1.91000668e-01 9.42299628e+01
+2.38742902e-01 9.42206518e+01
+2.86475840e-01 9.42129538e+01
+3.34185730e-01 9.42065280e+01
+3.81859477e-01 9.42009407e+01
+4.29444051e-01 9.41957235e+01
+4.76994676e-01 9.41904089e+01
diff --git a/tests/GPU/BFS/check_perf.sh b/tests/GPU/BFS/check_perf.sh
new file mode 120000
index 0000000000..6d20411c12
--- /dev/null
+++ b/tests/GPU/BFS/check_perf.sh
@@ -0,0 +1 @@
+../DomainFlowLES/check_perf.sh
\ No newline at end of file
diff --git a/tests/GPU/BFS/verifie b/tests/GPU/BFS/verifie
new file mode 100755
index 0000000000..f989b60130
--- /dev/null
+++ b/tests/GPU/BFS/verifie
@@ -0,0 +1,22 @@
+# Report a failed check: when the exit status $1 differs from the expected
+# status $2, print the command that was run ($msg) and raise the err flag.
+message()
+{
+   [ "$1" != "$2" ] && echo $ECHO_OPTS "Error ($1!=$2) when checking:\n $msg" && err=1
+   #echo $msg
+}
+
+#####################################
+# Non-regression comparison of the .son probe files (reduction)
+#####################################
+err=0
+# Empty the log once, then append to it, so that the output of every
+# comparison is kept and not only that of the last file processed.
+: > verifie.log
+for file in `ls *.son.ref 2>/dev/null`
+do
+   msg="compare_sonde $file ${file%.ref}"
+   eval $msg 1>>verifie.log 2>&1
+   message $? 0
+done
+exit $err
diff --git a/tests/GPU/Canal_VDF/Canal_VDF.data b/tests/GPU/Canal_VDF/Canal_VDF.data
new file mode 100644
index 0000000000..fa40868626
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF.data
@@ -0,0 +1,193 @@
+# LES with periodic box #
+# PARALLEL OK #
+Dimension 3
+
+Pb_hydraulique_turbulent pb
+
+Domaine dom_perio
+
+# BEGIN MESH #
+Mailler dom_perio
+{
+    Pave pave
+    {
+        /* warning dumb geometry */
+        Origine -30 0. 0.
+        Nombre_de_Noeuds 6 6 6
+        /* Nombre_de_Noeuds 101 101 101 */
+        Longueurs 30 2 10
+    }
+    {
+        Bord Periox      X = -30   0. <= Y <= 2. 0. <= Z <= 10.
+        Bord Periox      X = 0     0. <= Y <= 2. 0. <= Z <= 10.
+        Bord LowerWall   Y = 0.  -30. <= X <= 0. 0. <= Z <= 10.
+        Bord UpperWall   Y = 2.  -30. <= X <= 0. 0. <= Z <= 10.
+        Bord Perioz      Z = 0.  -30. <= X <= 0. 0. <= Y <= 2.
+        Bord Perioz      Z = 10. -30. <= X <= 0. 0. <= Y <= 2.
+    }
+}
+Declarer_bord_perio { domaine dom_perio bord Periox }
+Declarer_bord_perio { domaine dom_perio bord Perioz }
+# END MESH #
+
+# BEGIN PARTITION
+Partition dom_perio
+{
+    Partition_tool Metis { Nb_parts 4 }
+    Larg_joint 2
+    zones_name DOM
+}
+End
+END PARTITION #
+
+# BEGIN SCATTER
+Scatter DOM.Zones dom_perio
+END SCATTER #
+
+VDF dis 
+Lire dis { reorder { algo hilbert } }
+
+Runge_Kutta_ordre_3 sch
+Lire sch
+{
+	nb_pas_dt_max 10
+	tinit 0
+	dt_impr 1e-6
+	facsec 2
+	precision_impr 8
+	tcpumax 23
+}
+
+Associer pb dom_perio
+Associer pb sch
+
+Discretiser pb dis
+
+Lire pb
+{
+	Fluide_incompressible
+	{
+		mu	champ_uniforme 1 3.5e-04
+		rho	champ_uniforme 1 1
+	}
+	Navier_Stokes_turbulent
+	{
+                Solveur_pression	AMG GCP { rtol 1e-15 impr }
+#
+		solveur_pression	petsc cli
+						{
+						-ksp_view
+						-ksp_type gmres
+						-ksp_norm_type unpreconditioned
+						-pc_type hypre
+						-pc_hypre_type boomeramg
+						-pc_mg_galerkin_mat_product_algorithm hypre
+						-pc_hypre_boomeramg_relax_type_all l1scaled-Jacobi
+						-pc_hypre_boomeramg_coarsen_type pmis
+						-pc_hypre_boomeramg_interp_type ext+i
+						-pc_hypre_boomeramg_strong_threshold 0.30
+						-pc_hypre_boomeramg_print_statistics 1
+						-ksp_rtol 1e-15 impr
+						}
+#
+		conditions_initiales	{
+					vitesse champ_uniforme 3 1 0 0
+					pression champ_uniforme 1 0
+					}
+		conditions_limites	{
+					Periox		periodique
+					Perioz		periodique
+					LowerWall	paroi_fixe
+					UpperWall	paroi_fixe
+					}
+		convection		{ centre4 }
+		diffusion		{ }
+		sources			{ canal_perio { bord Periox } }
+		modele_turbulence	null { }
+	}
+	Postraitement
+	{
+		definition_champs	{
+					# instantaneous fields #
+					p		refChamp { pb_champ pb pression_pa }
+					ui		refChamp { pb_champ pb vitesse }
+					duidxj		refChamp { pb_champ pb gradient_vitesse }
+
+					# statistics #
+					moy_p		moyenne			{ t_deb 0 t_fin 1e+6 sources_reference { p } }
+					ec_p		ecart_type		{ t_deb 0 t_fin 1e+6 sources_reference { p } }
+					moy_ui		moyenne			{ t_deb 0 t_fin 1e+6 sources_reference { ui } }
+					ec_ui		ecart_type		{ t_deb 0 t_fin 1e+6 sources_reference { ui } }
+					moy_duidxj	moyenne			{ t_deb 0 t_fin 1e+6 sources_reference { duidxj } }
+					ec_duidxj	ecart_type		{ t_deb 0 t_fin 1e+6 sources_reference { duidxj } }
+					pui		correlation		{ t_deb 0 t_fin 1e+6 sources_reference { p , ui } }
+					pduidxj		correlation		{ t_deb 0 t_fin 1e+6 sources_reference { p , duidxj } }		# 9-component vector : component (i,j) -> column 3*i+j-4 #
+					uiuj		correlation		{ t_deb 0 t_fin 1e+6 sources_reference { ui , ui } }		# 9-component vector : component (i,j) -> column 3*i+j-4 #
+					duidxj_dukdxl	correlation		{ t_deb 0 t_fin 1e+6 sources_reference { duidxj , duidxj } }	# 81-component vector : component (i,j,k,l) -> column 27*i+9*j+3*k+l-40 #
+					uiujuk		correlation_triple	{ t_deb 0 t_fin 1e+6 sources_reference { ui , ui , ui } }	# 27-component vector : component (i,j,k) -> column 9*i+3*j+k-13 #
+
+					# for snapshots #
+					U		transformation	{ methode composante numero 0 localisation elem sources_reference { ui } }
+					V		transformation	{ methode composante numero 1 localisation elem sources_reference { ui } }
+					W		transformation	{ methode composante numero 2 localisation elem sources_reference { ui } }
+					UU		transformation	{ methode formule expression 1 U*U localisation elem sources_reference { U } }
+					VV		transformation	{ methode formule expression 1 V*V localisation elem sources_reference { V } }
+					WW		transformation	{ methode formule expression 1 W*W localisation elem sources_reference { W } }
+					UV		transformation	{ methode formule expression 1 U*V localisation elem sources_reference { U , V } }
+					UW		transformation	{ methode formule expression 1 U*W localisation elem sources_reference { U , W } }
+					VW		transformation	{ methode formule expression 1 V*W localisation elem sources_reference { V , W } }
+					}
+		sondes			{
+					# for convergence monitoring #
+					moy_p_streamwise	moy_p		periode 1e-6 segment 10 -25.13 1 0 0 1 0
+					moy_p_spanwise		moy_p		periode 1e-6 segment 10 0 1 0 0 1 9.42
+					moy_p_normal		moy_p		periode 1e-6 segment 10 0 0 0 0 2 0
+
+					moy_ui_streamwise	moy_ui		periode 1e-6 position_like moy_p_streamwise
+					moy_ui_spanwise		moy_ui		periode 1e-6 position_like moy_p_spanwise
+					moy_ui_normal		moy_ui		periode 1e-6 position_like moy_p_normal
+
+					uiuj_streamwise		uiuj		periode 1e-6 position_like moy_p_streamwise
+					uiuj_spanwise		uiuj		periode 1e-6 position_like moy_p_spanwise
+					uiuj_normal		uiuj		periode 1e-6 position_like moy_p_normal
+
+					ec_duidxj_streamwise	ec_duidxj	periode 1e-6 position_like moy_p_streamwise
+					ec_duidxj_spanwise	ec_duidxj	periode 1e-6 position_like moy_p_spanwise
+					ec_duidxj_normal	ec_duidxj	periode 1e-6 position_like moy_p_normal
+
+					uiujuk_streamwise	uiujuk		periode 1e-6 position_like moy_p_streamwise
+					uiujuk_spanwise		uiujuk		periode 1e-6 position_like moy_p_spanwise
+					uiujuk_normal		uiujuk		periode 1e-6 position_like moy_p_normal
+
+					# for spatial and temporal autocorrelations #
+					p_streamwise	grav	p	periode 1e-6 segment 10000 -25.1327 1 0 0 1 0
+					p_spanwise	grav	p	periode 1e-6 segment 10000 0 1 0 0 1 9.42478
+					p_normal	grav	p	periode 1e-6 segment 10000 0 0 0 0 2 0
+
+					ui_streamwise	grav	ui	periode 1e-6 position_like p_streamwise
+					ui_spanwise	grav	ui	periode 1e-6 position_like p_spanwise
+					ui_normal	grav	ui	periode 1e-6 position_like p_normal
+					}
+		format lml
+		champs dt_post 1e6
+					{
+					p elem
+					U elem
+					V elem
+					W elem
+					UU elem
+					VV elem
+					WW elem
+					UV elem
+					UW elem
+					VW elem
+					}
+	}
+	Sauvegarde_simple pdi Cas.sauv
+}
+
+EcritureLectureSpecial 0
+
+Resoudre pb
+
+Fin
diff --git a/tests/GPU/Canal_VDF/Canal_VDF.lml.gz b/tests/GPU/Canal_VDF/Canal_VDF.lml.gz
new file mode 100644
index 0000000000..e9a97eefdf
Binary files /dev/null and b/tests/GPU/Canal_VDF/Canal_VDF.lml.gz differ
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx90a b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx90a
new file mode 100644
index 0000000000..68df863854
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx90a
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-05-2026 -- 15:52:42
+OS:       g1301__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                17.2369        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.751213       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.44631        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.271812       
+Standard deviation between time steps:                                     0.103236       
+Time elapsed in the skipped time steps:                                    2.40684        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00308358      |  1.1        | 3              
+Convection operator                      | 0.01080208      |  4.0        | 3              
+Diffusion operator                       | 0.007258781     |  2.7        | 3              
+Gradient operator                        | 0.001891219     |  0.7        | 6              
+Divergence operator                      | 0.001879412     |  0.7        | 4              
+Source terms                             | 0.0007146429    |  0.3        | 3              
+Update ::mettre_a_jour                   | 0.03159206      | 11.6        | 1              
+Computation of the time step dt          | 0.0004381009    |  0.2        | 2              
+Post-treatment operations                | 0.2100512       | 77.3        | 1              
+Other operations                         | 0.004101117     |  1.5        | 
+
+Average number of iteration of the linear solver per call:                 0.37           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00271389      |  1.0        | 3               | 
+Kernels:                                 | 0.254017        | 93.5        | 451             | 
+Copy host to device:                     | 0.000601216     |  0.2        | 33              | 0.0 GB/s
+Copy device to host:                     | 0.000993021     |  0.4        | 26              | 8.4 GB/s
+Alloc/Free on device:                    | 0.000607269     |  0.2        | 269             | 
+GPU: 94% Copy H<->D: 0.59% Alloc/free: 0.22% Comm: 0% CPU & I/O: 4.7%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.87675        
+
+Total time for the whole computation                                       23.9668        
+
+[Slurm] Power consumption (32 s):  0.426 kW  0.004 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx942 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx942
new file mode 100644
index 0000000000..6a03fc85ea
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.adastra_gfx942
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     08-06-2026 -- 14:34:27
+OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD Instinct MI300A Accelerator
+Total number of threads:192
+GPU model: AMD Instinct MI300A
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                13.9131        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.68663        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.384921       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0427689      
+Standard deviation between time steps:                                     0.0226606      
+Time elapsed in the skipped time steps:                                    1.95805        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00332468      |  7.8        | 3              
+Convection operator                      | 0.00420472      |  9.8        | 3              
+Diffusion operator                       | 0.002919138     |  6.8        | 3              
+Gradient operator                        | 0.001110474     |  2.6        | 6              
+Divergence operator                      | 0.001066916     |  2.5        | 4              
+Source terms                             | 0.0004038194    |  0.9        | 3              
+Update ::mettre_a_jour                   | 0.009225593     | 21.6        | 1              
+Computation of the time step dt          | 0.0002626509    |  0.6        | 2              
+Post-treatment operations                | 0.0174134       | 40.7        | 1              
+Other operations                         | 0.002837557     |  6.6        | 
+
+Average number of iteration of the linear solver per call:                 0.37           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00305653      |  7.1        | 3               | 
+Kernels:                                 | 0.0285809       | 66.8        | 451             | 
+Copy host to device:                     | 0.000510485     |  1.2        | 33              | 0.0 GB/s
+Copy device to host:                     | 0.00102205      |  2.4        | 34              | 8.1 GB/s
+Alloc/Free on device:                    | 9.63393e-05     |  0.2        | 253             | 
+GPU: 74% Copy H<->D: 3.6% Alloc/free: 0.23% Comm: 0% CPU & I/O: 22%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.75888        
+
+Total time for the whole computation                                       18.015         
+
+[Slurm] Power consumption (25 s):  0.597 kW  0.004 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.dalianvl_cc100 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..2cd913980f
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:12:33
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.5758         
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.466502       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.267319       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0297022      
+Standard deviation between time steps:                                     0.0168388      
+Time elapsed in the skipped time steps:                                    1.67483        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00289441      |  9.7        | 3              
+Convection operator                      | 0.001819897     |  6.1        | 3              
+Diffusion operator                       | 0.001806095     |  6.1        | 3              
+Gradient operator                        | 0.0009081656    |  3.1        | 6              
+Divergence operator                      | 0.001402306     |  4.7        | 4              
+Source terms                             | 0.0005865699    |  2.0        | 3              
+Update ::mettre_a_jour                   | 0.004325817     | 14.6        | 1              
+Computation of the time step dt          | 0.0001865382    |  0.6        | 2              
+Post-treatment operations                | 0.01290875      | 43.5        | 1              
+Other operations                         | 0.002863604     |  9.6        | 
+
+Average number of iteration of the linear solver per call:                 0.37           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00221351      |  7.5        | 3               | 
+Kernels:                                 | 0.0144006       | 48.5        | 451             | 
+Copy host to device:                     | 0.00057469      |  1.9        | 33              | 0.0 GB/s
+Copy device to host:                     | 0.000497414     |  1.7        | 26              | 16.7 GB/s
+Alloc/Free on device:                    | 0.00182166      |  6.1        | 269             | 
+GPU: 56% Copy H<->D: 3.6% Alloc/free: 6.1% Comm: 0% CPU & I/O: 34%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.05186        
+
+Total time for the whole computation                                       9.56983        
+
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is157091_cc86 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is157091_cc86
new file mode 100644
index 0000000000..7f06a54c3b
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is157091_cc86
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     31-05-2026 -- 19:50:10
+OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
+CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
+Total number of threads:64
+GPU model: NVIDIA RTX A6000
+CUDA runtime version: 12.90
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                7.00921        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.519914       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.56956        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0632844      
+Standard deviation between time steps:                                     0.0113425      
+Time elapsed in the skipped time steps:                                    1.82503        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00376538      |  5.9        | 3              
+Convection operator                      | 0.008504433     | 13.4        | 3              
+Diffusion operator                       | 0.005449793     |  8.6        | 3              
+Gradient operator                        | 0.002636608     |  4.2        | 6              
+Divergence operator                      | 0.001631437     |  2.6        | 4              
+Source terms                             | 0.0007752543    |  1.2        | 3              
+Update ::mettre_a_jour                   | 0.02134089      | 33.7        | 1              
+Computation of the time step dt          | 0.0005473281    |  0.9        | 2              
+Post-treatment operations                | 0.01282795      | 20.3        | 1              
+Other operations                         | 0.005805318     |  9.2        | 
+
+Average number of iteration of the linear solver per call:                 0.37           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00337555      |  5.3        | 3               | 
+Kernels:                                 | 0.0529067       | 83.6        | 451             | 
+Copy host to device:                     | 0.000292819     |  0.5        | 33              | 0.1 GB/s
+Copy device to host:                     | 0.00106282      |  1.7        | 26              | 7.8 GB/s
+Alloc/Free on device:                    | 0.000734244     |  1.2        | 269             | 
+GPU: 89% Copy H<->D: 2.1% Alloc/free: 1.2% Comm: 0% CPU & I/O: 7.8%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.56347        
+
+Total time for the whole computation                                       9.96728        
+
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is159479_cc120 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..fb973e970a
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is159479_cc120
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-05-2026 -- 07:01:09
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May  1 12:45:19 UTC 2026 (6
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                5.37035        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.342442       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.254878       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0283198      
+Standard deviation between time steps:                                     0.00794149     
+Time elapsed in the skipped time steps:                                    1.32327        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00213505      |  7.5        | 3              
+Convection operator                      | 0.002743935     |  9.7        | 3              
+Diffusion operator                       | 0.00208883      |  7.4        | 3              
+Gradient operator                        | 0.0006796297    |  2.4        | 6              
+Divergence operator                      | 0.0006408326    |  2.3        | 4              
+Source terms                             | 0.000344486     |  1.2        | 3              
+Update ::mettre_a_jour                   | 0.009316315     | 32.9        | 1              
+Computation of the time step dt          | 0.0002445942    |  0.9        | 2              
+Post-treatment operations                | 0.008318809     | 29.4        | 1              
+Other operations                         | 0.001807347     |  6.4        | 
+
+Average number of iteration of the linear solver per call:                 0.37           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00189167      |  6.7        | 3               | 
+Kernels:                                 | 0.0215655       | 76.1        | 461             | 
+Copy host to device:                     | 0.000192181     |  0.7        | 33              | 0.1 GB/s
+Copy device to host:                     | 0.0011407       |  4.0        | 26              | 7.3 GB/s
+Alloc/Free on device:                    | 0.000567154     |  2.0        | 271             | 
+GPU: 83% Copy H<->D: 4.7% Alloc/free: 2% Comm: 0% CPU & I/O: 10%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.441399       
+
+Total time for the whole computation                                       7.38991        
+
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is247793_gfx1100 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is247793_gfx1100
new file mode 100644
index 0000000000..15535d1138
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.is247793_gfx1100
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 18:58:02
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                7.62223        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.897327       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.34957        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.149952       
+Standard deviation between time steps:                                     0.0408195      
+Time elapsed in the skipped time steps:                                    1.88478        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.010451        |  7.0        | 3              
+Convection operator                      | 0.00830886      |  5.5        | 3              
+Diffusion operator                       | 0.006585594     |  4.4        | 3              
+Gradient operator                        | 0.002247286     |  1.5        | 6              
+Divergence operator                      | 0.002138017     |  1.4        | 4              
+Source terms                             | 0.001000203     |  0.7        | 3              
+Update ::mettre_a_jour                   | 0.02632996      | 17.6        | 1              
+Computation of the time step dt          | 0.0007627901    |  0.5        | 2              
+Post-treatment operations                | 0.08695642      | 58.0        | 1              
+Other operations                         | 0.005171515     |  3.4        | 
+
+Average number of iteration of the linear solver per call:                 0.37           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.01003         |  6.7        | 3               | 
+Kernels:                                 | 0.13202         | 88.0        | 451             | 
+Copy host to device:                     | 0.000783234     |  0.5        | 33              | 0.0 GB/s
+Copy device to host:                     | 0.00111942      |  0.7        | 26              | 7.4 GB/s
+Alloc/Free on device:                    | 0.00106206      |  0.7        | 269             | 
+GPU: 95% Copy H<->D: 1.3% Alloc/free: 0.71% Comm: 0% CPU & I/O: 3.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.722624       
+
+Total time for the whole computation                                       11.5792        
+
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.jean-zay_cc90 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.jean-zay_cc90
new file mode 100644
index 0000000000..12c6f370df
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.jean-zay_cc90
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-05-2026 -- 09:48:35
+OS:       jzxh116__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                10.006         
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.498991       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.450887       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0500986      
+Standard deviation between time steps:                                     0.0548609      
+Time elapsed in the skipped time steps:                                    2.21404        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00273623      |  5.5        | 3              
+Convection operator                      | 0.002399998     |  4.8        | 3              
+Diffusion operator                       | 0.002259157     |  4.5        | 3              
+Gradient operator                        | 0.001007029     |  2.0        | 6              
+Divergence operator                      | 0.001120468     |  2.2        | 4              
+Source terms                             | 0.0005228747    |  1.0        | 3              
+Update ::mettre_a_jour                   | 0.008564734     | 17.1        | 1              
+Computation of the time step dt          | 0.0002005593    |  0.4        | 2              
+Post-treatment operations                | 0.02815309      | 56.2        | 1              
+Other operations                         | 0.003134466     |  6.3        | 
+
+Average number of iteration of the linear solver per call:                 0.37           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00221922      |  4.4        | 3               | 
+Kernels:                                 | 0.0194838       | 38.9        | 461             | 
+Copy host to device:                     | 0.000445538     |  0.9        | 33              | 0.0 GB/s
+Copy device to host:                     | 0.00111509      |  2.2        | 26              | 7.4 GB/s
+Alloc/Free on device:                    | 0.00107831      |  2.2        | 271             | 
+GPU: 43% Copy H<->D: 3.1% Alloc/free: 2.2% Comm: 0% CPU & I/O: 51%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.861773       
+
+Total time for the whole computation                                       13.5327        
+
+[Slurm] Power consumption (21 s):  0.431 kW  0.003 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.lumi_gfx90a b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.lumi_gfx90a
new file mode 100644
index 0000000000..0766d8527d
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.lumi_gfx90a
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     15-05-2026 -- 19:42:18
+OS:       nid005018__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                52.831         
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.5103         
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.24607        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.249563       
+Standard deviation between time steps:                                     0.0899302      
+Time elapsed in the skipped time steps:                                    2.44083        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00276879      |  1.1        | 3              
+Convection operator                      | 0.0105047       |  4.2        | 3              
+Diffusion operator                       | 0.007269927     |  2.9        | 3              
+Gradient operator                        | 0.001888666     |  0.8        | 6              
+Divergence operator                      | 0.001826117     |  0.7        | 4              
+Source terms                             | 0.0006813406    |  0.3        | 3              
+Update ::mettre_a_jour                   | 0.03157014      | 12.7        | 1              
+Computation of the time step dt          | 0.0004289448    |  0.2        | 2              
+Post-treatment operations                | 0.1885739       | 75.6        | 1              
+Other operations                         | 0.004050516     |  1.6        | 
+
+Average number of iteration of the linear solver per call:                 0.37           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00239408      |  1.0        | 3               | 
+Kernels:                                 | 0.235169        | 94.2        | 451             | 
+Copy host to device:                     | 0.000564144     |  0.2        | 33              | 0.0 GB/s
+Copy device to host:                     | 0.000952524     |  0.4        | 26              | 8.7 GB/s
+Alloc/Free on device:                    | 0.000560723     |  0.2        | 269             | 
+GPU: 95% Copy H<->D: 0.61% Alloc/free: 0.22% Comm: 0% CPU & I/O: 4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.70586        
+
+Total time for the whole computation                                       59.2237        
+
+[Slurm] Power consumption (79 s):  0.480 kW  0.011 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.topaze_cc80 b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.topaze_cc80
new file mode 100644
index 0000000000..1132193177
--- /dev/null
+++ b/tests/GPU/Canal_VDF/Canal_VDF_BENCH.TU.topaze_cc80
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Canal_VDF_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     15-05-2026 -- 13:21:39
+OS:       topaze7064__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : AMD EPYC 7763 64-Core Processor
+Total number of threads:256
+GPU model: NVIDIA A100-SXM4-80GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                9.93835        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.688868       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.395459       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0439399      
+Standard deviation between time steps:                                     0.0239771      
+Time elapsed in the skipped time steps:                                    2.57337        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00333603      |  7.6        | 3              
+Convection operator                      | 0.003698002     |  8.4        | 3              
+Diffusion operator                       | 0.003023572     |  6.9        | 3              
+Gradient operator                        | 0.001306829     |  3.0        | 6              
+Divergence operator                      | 0.001484432     |  3.4        | 4              
+Source terms                             | 0.0006081321    |  1.4        | 3              
+Update ::mettre_a_jour                   | 0.01068189      | 24.3        | 1              
+Computation of the time step dt          | 0.0002696096    |  0.6        | 2              
+Post-treatment operations                | 0.01635107      | 37.2        | 1              
+Other operations                         | 0.003180315     |  7.2        | 
+
+Average number of iteration of the linear solver per call:                 0.37           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00269444      |  6.1        | 3               | 
+Kernels:                                 | 0.0273097       | 62.2        | 451             | 
+Copy host to device:                     | 0.000447015     |  1.0        | 33              | 0.0 GB/s
+Copy device to host:                     | 0.00123098      |  2.8        | 26              | 6.7 GB/s
+Alloc/Free on device:                    | 0.00108718      |  2.5        | 269             | 
+GPU: 68% Copy H<->D: 3.8% Alloc/free: 2.5% Comm: 0% CPU & I/O: 25%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.36392        
+
+Total time for the whole computation                                       14.2711        
+
+[Slurm] Power consumption (51 s):  0.611 kW  0.009 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/Canal_VDF/check_perf.sh b/tests/GPU/Canal_VDF/check_perf.sh
new file mode 120000
index 0000000000..6d20411c12
--- /dev/null
+++ b/tests/GPU/Canal_VDF/check_perf.sh
@@ -0,0 +1 @@
+../DomainFlowLES/check_perf.sh
\ No newline at end of file
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing.data b/tests/GPU/ColdLegMixing/ColdLegMixing.data
index 322a967d4f..b6e284f0e1 100644
--- a/tests/GPU/ColdLegMixing/ColdLegMixing.data
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing.data
@@ -42,7 +42,7 @@ Scatter DOM.Zones dom
 END SCATTER #
 
 VEFPreP1B dis
-lire dis { P0 P1 changement_de_base_P1bulle 1 CL_pression_sommet_faible 0 modif_div_face_dirichlet 0 }
+Lire dis { reorder { algo none } } # reorder make differences #
 
 Runge_Kutta_rationnel_ordre_2 sch
 lire sch
@@ -91,7 +91,7 @@ Lire pb
             preconditionnement_diag 1
             seuil_diffusion_implicite 1e-10
         }
-        solveur_pression	AMG GCP { atol 1e-5 impr }
+        solveur_pression	AMG GCP { atol 1e-9 impr }
         convection		{ MUSCL }
         diffusion		{ }
         conditions_initiales	{ vitesse champ_uniforme 3 0 0 0 }
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.adastra_gfx90a b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.adastra_gfx90a
index dda0d84caa..34cd597e00 100644
--- a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.adastra_gfx90a
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 21:07:42
-OS:       g1031__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 09:16:32
+OS:       g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2160000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                54.8818        
+Total time of the start-up:                                                55.4215        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.50432        
+Average time of the resolution of the linear problem per call:             1.57831        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               8.39766        
+Total time of the time loop:                                               7.89771        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.933074       
-Standard deviation between time steps:                                     0.168845       
-Time elapsed in the skipped time steps:                                    1.56224        
+Average time per time step:                                                0.877524       
+Standard deviation between time steps:                                     0.345544       
+Time elapsed in the skipped time steps:                                    1.75101        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.121511        | 11.0        | 2              
-Matrix assembly for implicit scheme      | 0.04890356      |  4.4        | 4              
-Convection operator                      | 0.1384608       | 12.5        | 10             
-Diffusion operator                       | 0.1001888       |  9.1        | 18             
-Gradient operator                        | 0.1041166       |  9.4        | 5              
-Divergence operator                      | 0.03953337      |  3.6        | 6              
-Source terms                             | 0.001168745     |  0.1        | 4              
-Update ::mettre_a_jour                   | 0.2427784       | 21.9        | 4              
-Solver for implicit diffusion            | 0.02933033      |  2.7        | 4              
-Computation of the time step dt          | 0.02264737      |  2.0        | 6              
-Post-treatment operations                | 0.07146423      |  6.5        | 1              
-Other operations                         | 0.01297066      |  1.2        | 
+Linear solver resolutions Ax=B           | 0.154441        | 17.6        | 2              
+Matrix assembly for implicit scheme      | 0.0378347       |  4.3        | 4              
+Convection operator                      | 0.0779121       |  8.9        | 9              
+Diffusion operator                       | 0.08704269      |  9.9        | 18             
+Gradient operator                        | 0.05078478      |  5.8        | 5              
+Divergence operator                      | 0.02666375      |  3.0        | 6              
+Source terms                             | 0.0009005066    |  0.1        | 4              
+Update ::mettre_a_jour                   | 0.2270792       | 25.9        | 4              
+Solver for implicit diffusion            | 0.02766899      |  3.2        | 4              
+Computation of the time step dt          | 0.01654435      |  1.9        | 6              
+Post-treatment operations                | 0.1431845       | 16.3        | 1              
+Other operations                         | 0.02746652      |  3.1        | 
 
-Average number of iteration of the linear solver per call:                 33.7           
+Average number of iteration of the linear solver per call:                 52.7           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 33.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.121127        | 13.0        | 2               | 
-Kernels:                                 | 0.775968        | 83.2        | 1111            | 
-Copy host to device:                     | 0.000841133     |  0.1        | 42              | 2.6 GB/s
-Copy device to host:                     | 0.00288309      |  0.3        | 44              | 13.7 GB/s
-Alloc/Free on device:                    | 0.000129747     |  0.0        | 11              | 
-GPU: 96% Copy H<->D: 0.4% Alloc/free: 0.014% Comm: 0% CPU & I/O: 3.4%
+Libraries:                               | 0.154065        | 17.6        | 2               | 
+Kernels:                                 | 0.613257        | 69.9        | 1097            | 
+Copy host to device:                     | 0.000828083     |  0.1        | 42              | 2.6 GB/s
+Copy device to host:                     | 0.00289064      |  0.3        | 44              | 13.6 GB/s
+Alloc/Free on device:                    | 0.000112598     |  0.0        | 11              | 
+GPU: 87% Copy H<->D: 0.42% Alloc/free: 0.013% Comm: 0% CPU & I/O: 12%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               2.54669        
+Time of the post-resolution:                                               2.69953        
 
-Total time for the whole computation                                       67.3884        
+Total time for the whole computation                                       67.7697        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (76 s):  0.483 kW  0.010 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.dalianvl_cc100 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..e99fd2ca48
--- /dev/null
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the ColdLegMixing_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:13:09
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2160000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                27.0598        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.11402        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.86596        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.207329       
+Standard deviation between time steps:                                     0.0668016      
+Time elapsed in the skipped time steps:                                    0.674962       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0830543       | 40.1        | 2              
+Matrix assembly for implicit scheme      | 0.003942188     |  1.9        | 4              
+Convection operator                      | 0.02023463      |  9.8        | 10             
+Diffusion operator                       | 0.009598779     |  4.6        | 18             
+Gradient operator                        | 0.005206757     |  2.5        | 5              
+Divergence operator                      | 0.002221272     |  1.1        | 6              
+Source terms                             | 0.0004284402    |  0.2        | 4              
+Update ::mettre_a_jour                   | 0.02895494      | 14.0        | 4              
+Solver for implicit diffusion            | 0.009809403     |  4.7        | 4              
+Computation of the time step dt          | 0.001842386     |  0.9        | 6              
+Post-treatment operations                | 0.02996372      | 14.5        | 1              
+Other operations                         | 0.01207206      |  5.8        | 
+
+Average number of iteration of the linear solver per call:                 53             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0827913       | 39.9        | 2               | 
+Kernels:                                 | 0.0879506       | 42.4        | 1063            | 
+Copy host to device:                     | 0.000721296     |  0.3        | 42              | 3.0 GB/s
+Copy device to host:                     | 0.000766822     |  0.4        | 31              | 44.4 GB/s
+Alloc/Free on device:                    | 0.0016505       |  0.8        | 39              | 
+GPU: 82% Copy H<->D: 0.72% Alloc/free: 0.8% Comm: 0% CPU & I/O: 16%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.59505        
+
+Total time for the whole computation                                       31.1958        
+
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.eureka_cc89 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..ea0331118d
--- /dev/null
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.eureka_cc89
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the ColdLegMixing_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:29:14
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2160000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                40.8787        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.41704        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               4.14205        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.460228       
+Standard deviation between time steps:                                     0.13345        
+Time elapsed in the skipped time steps:                                    1.76797        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.150019        | 32.6        | 2              
+Matrix assembly for implicit scheme      | 0.01175697      |  2.6        | 4              
+Convection operator                      | 0.05860689      | 12.7        | 10             
+Diffusion operator                       | 0.03013431      |  6.5        | 18             
+Gradient operator                        | 0.01550689      |  3.4        | 5              
+Divergence operator                      | 0.008359857     |  1.8        | 6              
+Source terms                             | 0.001063969     |  0.2        | 4              
+Update ::mettre_a_jour                   | 0.0559526       | 12.2        | 4              
+Solver for implicit diffusion            | 0.03351775      |  7.3        | 4              
+Computation of the time step dt          | 0.005934557     |  1.3        | 6              
+Post-treatment operations                | 0.05752008      | 12.5        | 1              
+Other operations                         | 0.03185519      |  6.9        | 
+
+Average number of iteration of the linear solver per call:                 53             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.149355        | 32.5        | 2               | 
+Kernels:                                 | 0.291186        | 63.3        | 1110            | 
+Copy host to device:                     | 0.000486001     |  0.1        | 42              | 4.4 GB/s
+Copy device to host:                     | 0.00310949      |  0.7        | 45              | 12.7 GB/s
+Alloc/Free on device:                    | 0.000238185     |  0.1        | 11              | 
+GPU: 96% Copy H<->D: 0.78% Alloc/free: 0.052% Comm: 0% CPU & I/O: 3.4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.773176       
+
+Total time for the whole computation                                       47.5619        
+
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.irene-amd-ccrt_cc70
index 7741a83d11..3d7f462e42 100644
--- a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.irene-amd-ccrt_cc70
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.irene-amd-ccrt_cc70
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 14:08:56
-OS:       irene7056__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     23-04-2026 -- 14:44:19
+OS:       irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
 Total number of threads:80
 GPU model: Tesla V100-SXM2-16GB
@@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2160000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                68.8927        
+Total time of the start-up:                                                72.2546        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             2.55071        
+Average time of the resolution of the linear problem per call:             2.2267         
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               6.35237        
+Total time of the time loop:                                               5.59819        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.705819       
-Standard deviation between time steps:                                     0.111849       
-Time elapsed in the skipped time steps:                                    2.75659        
+Average time per time step:                                                0.622021       
+Standard deviation between time steps:                                     0.100609       
+Time elapsed in the skipped time steps:                                    2.66817        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.166276        | 23.6        | 2              
-Matrix assembly for implicit scheme      | 0.03361593      |  4.8        | 4              
-Convection operator                      | 0.1028779       | 14.6        | 10             
-Diffusion operator                       | 0.06363367      |  9.0        | 18             
-Gradient operator                        | 0.04134556      |  5.9        | 5              
-Divergence operator                      | 0.04251433      |  6.0        | 6              
-Source terms                             | 0.001611651     |  0.2        | 4              
-Update ::mettre_a_jour                   | 0.09962028      | 14.1        | 4              
-Solver for implicit diffusion            | 0.04006988      |  5.7        | 4              
-Computation of the time step dt          | 0.02328207      |  3.3        | 6              
-Post-treatment operations                | 0.05626412      |  8.0        | 1              
-Other operations                         | 0.0347072       |  4.9        | 
+Linear solver resolutions Ax=B           | 0.228427        | 36.7        | 2              
+Matrix assembly for implicit scheme      | 0.02427188      |  3.9        | 4              
+Convection operator                      | 0.05417733      |  8.7        | 9              
+Diffusion operator                       | 0.03900059      |  6.3        | 18             
+Gradient operator                        | 0.02075651      |  3.3        | 5              
+Divergence operator                      | 0.02327661      |  3.7        | 6              
+Source terms                             | 0.001577241     |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.08490219      | 13.6        | 4              
+Solver for implicit diffusion            | 0.03963722      |  6.4        | 4              
+Computation of the time step dt          | 0.0155664       |  2.5        | 6              
+Post-treatment operations                | 0.05371404      |  8.6        | 1              
+Other operations                         | 0.03671423      |  5.9        | 
 
-Average number of iteration of the linear solver per call:                 32.7           
+Average number of iteration of the linear solver per call:                 52.8           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 32.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.165709        | 23.5        | 2               | 
-Kernels:                                 | 0.49784         | 70.5        | 1111            | 
-Copy host to device:                     | 0.00112713      |  0.2        | 42              | 1.9 GB/s
-Copy device to host:                     | 0.00941522      |  1.3        | 44              | 4.2 GB/s
-Alloc/Free on device:                    | 0.000490555     |  0.1        | 11              | 
-GPU: 94% Copy H<->D: 1.5% Alloc/free: 0.07% Comm: 0% CPU & I/O: 4.4%
+Libraries:                               | 0.227866        | 36.6        | 2               | 
+Kernels:                                 | 0.350872        | 56.4        | 1097            | 
+Copy host to device:                     | 0.00111442      |  0.2        | 42              | 1.9 GB/s
+Copy device to host:                     | 0.0095051       |  1.5        | 44              | 4.1 GB/s
+Alloc/Free on device:                    | 0.00031579      |  0.1        | 11              | 
+GPU: 93% Copy H<->D: 1.7% Alloc/free: 0.051% Comm: 0% CPU & I/O: 5.2%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               1.89088        
+Time of the post-resolution:                                               2.05112        
 
-Total time for the whole computation                                       79.8925        
+Total time for the whole computation                                       82.5721        
 
-[Slurm] Power consumption (104 s):  0.220 kW  0.006 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (97 s):  0.226 kW  0.006 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is157091_cc86 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is157091_cc86
index 0ccdcbfa56..6bc6865978 100644
--- a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is157091_cc86
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is157091_cc86
@@ -8,7 +8,7 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     10-03-2026 -- 08:37:35
+Date:     22-04-2026 -- 07:49:49
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2160000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                42.1372        
+Total time of the start-up:                                                44.4438        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.15194        
+Average time of the resolution of the linear problem per call:             1.42068        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               6.28092        
+Total time of the time loop:                                               6.11625        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.69788        
-Standard deviation between time steps:                                     0.11746        
-Time elapsed in the skipped time steps:                                    1.51742        
+Average time per time step:                                                0.679583       
+Standard deviation between time steps:                                     0.087326       
+Time elapsed in the skipped time steps:                                    1.67971        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.167858        | 24.1        | 2              
-Matrix assembly for implicit scheme      | 0.03240089      |  4.6        | 4              
-Convection operator                      | 0.09896009      | 14.2        | 10             
-Diffusion operator                       | 0.05078759      |  7.3        | 18             
-Gradient operator                        | 0.03526034      |  5.1        | 5              
-Divergence operator                      | 0.0328344       |  4.7        | 6              
-Source terms                             | 0.001295119     |  0.2        | 4              
-Update ::mettre_a_jour                   | 0.1129639       | 16.2        | 4              
-Solver for implicit diffusion            | 0.04478752      |  6.4        | 4              
-Computation of the time step dt          | 0.02486768      |  3.6        | 6              
-Post-treatment operations                | 0.05516704      |  7.9        | 1              
-Other operations                         | 0.04069752      |  5.8        | 
+Linear solver resolutions Ax=B           | 0.16942         | 24.9        | 2              
+Matrix assembly for implicit scheme      | 0.03782602      |  5.6        | 4              
+Convection operator                      | 0.100055        | 14.7        | 10             
+Diffusion operator                       | 0.05079105      |  7.5        | 18             
+Gradient operator                        | 0.02477128      |  3.6        | 5              
+Divergence operator                      | 0.03284109      |  4.8        | 6              
+Source terms                             | 0.001298781     |  0.2        | 4              
+Update ::mettre_a_jour                   | 0.107247        | 15.8        | 4              
+Solver for implicit diffusion            | 0.04514434      |  6.6        | 4              
+Computation of the time step dt          | 0.02508486      |  3.7        | 6              
+Post-treatment operations                | 0.04197711      |  6.2        | 1              
+Other operations                         | 0.04312715      |  6.3        | 
 
 Average number of iteration of the linear solver per call:                 32.7           
 
@@ -63,16 +63,16 @@ Average number of iteration of the linear solver per call:                 32.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.167302        | 24.0        | 2               | 
-Kernels:                                 | 0.496374        | 71.1        | 1111            | 
-Copy host to device:                     | 0.00053393      |  0.1        | 42              | 4.0 GB/s
-Copy device to host:                     | 0.00382078      |  0.5        | 44              | 10.3 GB/s
-Alloc/Free on device:                    | 0.000359834     |  0.1        | 11              | 
-GPU: 95% Copy H<->D: 0.62% Alloc/free: 0.052% Comm: 0% CPU & I/O: 4.2%
+Libraries:                               | 0.168832        | 24.8        | 2               | 
+Kernels:                                 | 0.488327        | 71.9        | 1107            | 
+Copy host to device:                     | 0.000582148     |  0.1        | 42              | 3.7 GB/s
+Copy device to host:                     | 0.00392578      |  0.6        | 44              | 10.0 GB/s
+Alloc/Free on device:                    | 0.000275118     |  0.0        | 11              | 
+GPU: 97% Copy H<->D: 0.66% Alloc/free: 0.04% Comm: 0% CPU & I/O: 2.6%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.87779        
+Time of the post-resolution:                                               0.919342       
 
-Total time for the whole computation                                       50.8133        
+Total time for the whole computation                                       53.1591        
 
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is159479_cc120 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..c6cae453a5
--- /dev/null
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is159479_cc120
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the ColdLegMixing_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:32:59
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2160000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                33.0966        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.828613       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.23231        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.248034       
+Standard deviation between time steps:                                     0.0364159      
+Time elapsed in the skipped time steps:                                    0.984948       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.103753        | 41.8        | 2              
+Matrix assembly for implicit scheme      | 0.005429958     |  2.2        | 4              
+Convection operator                      | 0.02593384      | 10.5        | 9              
+Diffusion operator                       | 0.01465164      |  5.9        | 18             
+Gradient operator                        | 0.007345582     |  3.0        | 5              
+Divergence operator                      | 0.003139903     |  1.3        | 6              
+Source terms                             | 0.0005302821    |  0.2        | 4              
+Update ::mettre_a_jour                   | 0.03202612      | 12.9        | 4              
+Solver for implicit diffusion            | 0.01674319      |  6.8        | 4              
+Computation of the time step dt          | 0.002658233     |  1.1        | 6              
+Post-treatment operations                | 0.01895837      |  7.6        | 1              
+Other operations                         | 0.01686431      |  6.8        | 
+
+Average number of iteration of the linear solver per call:                 52.8           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.103627        | 41.8        | 2               | 
+Kernels:                                 | 0.128405        | 51.8        | 1097            | 
+Copy host to device:                     | 0.000396511     |  0.2        | 42              | 5.5 GB/s
+Copy device to host:                     | 0.00455285      |  1.8        | 44              | 8.7 GB/s
+Alloc/Free on device:                    | 0.000110567     |  0.0        | 11              | 
+GPU: 94% Copy H<->D: 2% Alloc/free: 0.045% Comm: 0% CPU & I/O: 4.4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.711175       
+
+Total time for the whole computation                                       37.025         
+
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is247793_gfx1100 b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is247793_gfx1100
index 4a55029b7d..036dba545d 100644
--- a/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is247793_gfx1100
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_BENCH.TU.is247793_gfx1100
@@ -1,53 +1,78 @@
-Statistiques d'initialisation du calcul
-
-Temps total                       57.599
-
-Statistiques de resolution du probleme
-
-Temps total                       14.8858
-
-
-Timesteps                         10
-Secondes / pas de temps           1.48857
-Dont solveurs Ax=B                0.154625 10% (2 appels/pas de temps)
-Dont solveur diffusion_implicite  0.060475  4% (4 appels/pas de temps)
-Dont assemblage matrice_implicite 0.071929  4% (4 appels/pas de temps)
-Dont mettre_a_jour                0.152573 10% (4 appels/pas de temps)
-Dont operateurs convection        0.227592 15% (9.6 appels/pas de temps)
-Dont operateurs diffusion         0.108532  7% (18 appels/pas de temps)
-Dont operateurs gradient          0.044300  2% (5 appels/pas de temps)
-Dont operateurs divergence        0.022561  1% (6 appels/pas de temps)
-Dont operateurs source            0.005090  0% (4 appels/pas de temps)
-Dont operations postraitement     0.574336 38% (1 appel/pas de temps)
-Dont calcul dt                    0.013271  0% (6 appels/pas de temps)
-Dont calcul divers                0.053290  3% (0 appels/pas de temps)
-Nb solveur / pas de temps         2
-Secondes / solveur                0.0773127
-Iterations / solveur              21.4
-GPU statistics per time step (experimental):
-Libraries : 0.154164 s 10.4%  2.0 calls
-Kernels   : 0.622647 s 41.8% 4609110.7 calls
-Copy H2D  : 0.042943 s  2.9% 102.3 calls  9.2 GB/s
-Copy D2H  : 0.010556 s  0.7% 102.4 calls 15.4 GB/s
-Alloc/Free: 0.006001 s  0.4% 74.7 calls
-GPU: 52.1% Copy H<->D: 3.5% Alloc/Free: 0.4% Comm: 0% CPU & Others: 43.8%
-I/O:
-
-Timesteps = number of time steps
-Nb solveur = number of linear system resolutions
-Nb assemblage implicite = number of matrix assemblies for the implicit scheme
-Iterations = average number of iterations of the solver
-Communications = fraction of the time spent
-                 in communications between processors (excluding io files)
-Network latency = time of one mpsum measured by an internal bench over 0.1s
-Network bandwidth = maximum on all processors
-                    of the average bandwidth of send_recv operations
-Waiting time = estimation of the waiting time of the different processors
-
-Max_waiting_time big    => probably due to a bad partitioning
-Communications > 30%    => too many processors or network too slow
-
-Statistiques de post resolution
-
-Temps total                       4.71802
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the ColdLegMixing_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 18:30:47
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2160000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                38.1848        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.76015        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               7.398          
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.822          
+Standard deviation between time steps:                                     0.165676       
+Time elapsed in the skipped time steps:                                    2.16854        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.271718        | 33.1        | 2              
+Matrix assembly for implicit scheme      | 0.02632243      |  3.2        | 4              
+Convection operator                      | 0.1223453       | 14.9        | 10             
+Diffusion operator                       | 0.06110811      |  7.4        | 18             
+Gradient operator                        | 0.03366703      |  4.1        | 5              
+Divergence operator                      | 0.02193279      |  2.7        | 6              
+Source terms                             | 0.001620552     |  0.2        | 4              
+Update ::mettre_a_jour                   | 0.1051999       | 12.8        | 4              
+Solver for implicit diffusion            | 0.04933715      |  6.0        | 4              
+Computation of the time step dt          | 0.01476115      |  1.8        | 6              
+Post-treatment operations                | 0.06948995      |  8.5        | 1              
+Other operations                         | 0.04449732      |  5.4        | 
+
+Average number of iteration of the linear solver per call:                 53             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.270707        | 32.9        | 2               | 
+Kernels:                                 | 0.519918        | 63.3        | 1063            | 
+Copy host to device:                     | 0.00111484      |  0.1        | 42              | 1.9 GB/s
+Copy device to host:                     | 0.00244084      |  0.3        | 31              | 14.0 GB/s
+Alloc/Free on device:                    | 0.00112784      |  0.1        | 39              | 
+GPU: 96% Copy H<->D: 0.43% Alloc/free: 0.14% Comm: 0% CPU & I/O: 3.2%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.09806        
+
+Total time for the whole computation                                       48.8494        
 
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_TMIN.son.ref b/tests/GPU/ColdLegMixing/ColdLegMixing_TMIN.son.ref
index b8608d39cf..a2b95357eb 100644
--- a/tests/GPU/ColdLegMixing/ColdLegMixing_TMIN.son.ref
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_TMIN.son.ref
@@ -4,12 +4,12 @@
 # Type NUMERO_ELEM_SUR_MAITRE
 0.00000000e+00 0.00000000e+00
 5.00000000e-03 0.00000000e+00
-1.00000000e-02 -1.54054285e-11
-1.50000000e-02 -1.35238391e-10
-2.00000000e-02 -5.29718902e-10
-2.50000000e-02 -1.44619321e-09
-3.00000000e-02 -3.20614607e-09
-3.50000000e-02 -6.20193023e-09
-4.00000000e-02 -1.08928586e-08
-4.50000000e-02 -1.78005486e-08
-5.00000000e-02 -2.75035004e-08
+1.00000000e-02 -1.54054293e-11
+1.50000000e-02 -1.35238393e-10
+2.00000000e-02 -5.29718896e-10
+2.50000000e-02 -1.44619318e-09
+3.00000000e-02 -3.20614598e-09
+3.50000000e-02 -6.20193004e-09
+4.00000000e-02 -1.08928583e-08
+4.50000000e-02 -1.78005482e-08
+5.00000000e-02 -2.75034999e-08
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_VMAX.son.ref b/tests/GPU/ColdLegMixing/ColdLegMixing_VMAX.son.ref
index 4b5bf6d664..5a4f10ebd6 100644
--- a/tests/GPU/ColdLegMixing/ColdLegMixing_VMAX.son.ref
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_VMAX.son.ref
@@ -3,13 +3,13 @@
 # Champ VMAX [m/s]
 # Type NUMERO_ELEM_SUR_MAITRE
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
-5.00000000e-03 2.83341776e-03 2.29668974e-03 9.28005173e-04
-1.00000000e-02 5.68379870e-03 4.55499731e-03 1.85889956e-03
-1.50000000e-02 8.54247059e-03 6.79018237e-03 2.79168084e-03
-2.00000000e-02 1.14022905e-02 9.01491002e-03 3.72483538e-03
+5.00000000e-03 2.83341777e-03 2.29668974e-03 9.28005164e-04
+1.00000000e-02 5.68379868e-03 4.55499732e-03 1.85889956e-03
+1.50000000e-02 8.54247057e-03 6.79018239e-03 2.79168083e-03
+2.00000000e-02 1.14022905e-02 9.01491004e-03 3.72483537e-03
 2.50000000e-02 1.42619333e-02 1.12325293e-02 4.65783489e-03
-3.00000000e-02 1.71207647e-02 1.34443266e-02 5.59032875e-03
-3.50000000e-02 1.99784164e-02 1.56505655e-02 6.52205471e-03
-4.00000000e-02 2.28346205e-02 1.78509971e-02 7.45279984e-03
-4.50000000e-02 2.56891456e-02 2.00536201e-02 8.38237896e-03
+3.00000000e-02 1.71207647e-02 1.34443265e-02 5.59032877e-03
+3.50000000e-02 1.99784164e-02 1.56505655e-02 6.52205474e-03
+4.00000000e-02 2.28346205e-02 1.78509971e-02 7.45279986e-03
+4.50000000e-02 2.56891456e-02 2.00536201e-02 8.38237897e-03
 5.00000000e-02 2.85417720e-02 2.22548485e-02 9.31062254e-03
diff --git a/tests/GPU/ColdLegMixing/ColdLegMixing_VMIN.son.ref b/tests/GPU/ColdLegMixing/ColdLegMixing_VMIN.son.ref
index 3adde92724..c4e9503b6a 100644
--- a/tests/GPU/ColdLegMixing/ColdLegMixing_VMIN.son.ref
+++ b/tests/GPU/ColdLegMixing/ColdLegMixing_VMIN.son.ref
@@ -3,13 +3,13 @@
 # Champ VMIN [m/s]
 # Type NUMERO_ELEM_SUR_MAITRE
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
-5.00000000e-03 -2.83341863e-03 -3.24383912e-03 -9.28005166e-04
-1.00000000e-02 -5.60710555e-03 -6.54040291e-03 -1.88279343e-03
-1.50000000e-02 -8.34715109e-03 -9.86741368e-03 -2.85002332e-03
-2.00000000e-02 -1.10749232e-02 -1.32043684e-02 -3.81937590e-03
-2.50000000e-02 -1.37964246e-02 -1.65457438e-02 -4.78892735e-03
-3.00000000e-02 -1.65145141e-02 -1.98887507e-02 -5.75785211e-03
-3.50000000e-02 -1.92305284e-02 -2.32318978e-02 -6.72575799e-03
-4.00000000e-02 -2.19653111e-02 -2.65743352e-02 -7.69241057e-03
-4.50000000e-02 -2.47150783e-02 -2.99155320e-02 -8.65762676e-03
+5.00000000e-03 -2.83341860e-03 -3.24383911e-03 -9.28005164e-04
+1.00000000e-02 -5.60710555e-03 -6.54040292e-03 -1.88279341e-03
+1.50000000e-02 -8.34715112e-03 -9.86741370e-03 -2.85002328e-03
+2.00000000e-02 -1.10749232e-02 -1.32043684e-02 -3.81937586e-03
+2.50000000e-02 -1.37964246e-02 -1.65457438e-02 -4.78892732e-03
+3.00000000e-02 -1.65145140e-02 -1.98887507e-02 -5.75785210e-03
+3.50000000e-02 -1.92305283e-02 -2.32318978e-02 -6.72575799e-03
+4.00000000e-02 -2.19653111e-02 -2.65743352e-02 -7.69241058e-03
+4.50000000e-02 -2.47150784e-02 -2.99155320e-02 -8.65762676e-03
 5.00000000e-02 -2.74653003e-02 -3.32551150e-02 -9.62123630e-03
diff --git a/tests/GPU/ColdLegMixing/PAR_ColdLegMixing_BENCH.TU.adastra_gfx942x8 b/tests/GPU/ColdLegMixing/PAR_ColdLegMixing_BENCH.TU.adastra_gfx942x8
new file mode 100644
index 0000000000..bfe1fb7576
--- /dev/null
+++ b/tests/GPU/ColdLegMixing/PAR_ColdLegMixing_BENCH.TU.adastra_gfx942x8
@@ -0,0 +1,114 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PAR_ColdLegMixing_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     08-06-2026 -- 14:36:34
+OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD Instinct MI300A Accelerator
+Total number of threads:192
+GPU model: AMD Instinct MI300A
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 8
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2160000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                23.5004        
+Number of virtual exchanges:                                               186            
+Maximum number of MPI allreduce per time step                              400            
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             2.04898        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               3.27794        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.364215       
+Standard deviation between time steps:                                     0.0790277      
+Time elapsed in the skipped time steps:                                    0.728734       
+
+Percent of total time spend in communication:                              15.8925        
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.194097        | 53.3        | 2              
+Matrix assembly for implicit scheme      | 0.004431142     |  1.2        | 4              
+Convection operator                      | 0.007754889     |  2.1        | 9              
+Diffusion operator                       | 0.005278478     |  1.4        | 18             
+Gradient operator                        | 0.00350893      |  1.0        | 5              
+Divergence operator                      | 0.00190806      |  0.5        | 6              
+Source terms                             | 0.0002011109    |  0.1        | 4              
+Update ::mettre_a_jour                   | 0.04637919      | 12.7        | 4              
+Solver for implicit diffusion            | 0.01803998      |  5.0        | 4              
+Computation of the time step dt          | 0.002887528     |  0.8        | 6              
+Post-treatment operations                | 0.04766509      | 13.1        | 1              
+Other operations                         | 0.03206428      |  8.8        | 
+Number of virtual exchanges per time step:                                 148            
+Maximum number of MPI allreduce per time step                              78.8           
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Average number of iteration of the linear solver per call:                 52.7           
+
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                               Time loop statistics: IO
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Output write sequential:                                                   1066           MB/s
+
+---------------------------------------------------------------------------------------------------------
+Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
+---------------------------------------------------------------------------------------------------------
+
+Average of the fraction of the time spent in communications between processors:      14.7           %
+Max of the fraction of the time spent in communications between processors:          25.7           %
+Min of the fraction of the time spent in communications between processors:          12.5           %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         1.38126e-05    
+Network maximum bandwidth on all processors:                                         41.9 GB/s      
+Total network traffic:                                                               4692.53        MB/time step
+Average message size:                                                                401.819        kB
+Min waiting time:                                                                    12.3           % of total time
+Max waiting time:                                                                    25.4           % of total time
+Avg waiting time:                                                                    17.65          % of total time
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.19315         | 53.0        | 2               | 
+Kernels:                                 | 0.0718475       | 19.7        | 2040            | 
+Copy host to device:                     | 0.000624345     |  0.2        | 33              | 0.6 GB/s
+Copy device to host:                     | 0.000775929     |  0.2        | 22              | 5.7 GB/s
+Alloc/Free on device:                    | 0.000295281     |  0.1        | 37              | 
+GPU: 73% Copy H<->D: 0.38% Alloc/free: 0.081% Comm: 19% CPU & I/O: 7.4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               2.30176        
+Maximum number of MPI allreduce per time step                              57             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Max waiting time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Total time for the whole computation                                       29.8089        
+
+[Slurm] Power consumption (38 s):  1.690 kW  0.018 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES.data b/tests/GPU/DomainFlowLES/DomainFlowLES.data
index 077a878e64..9db085c491 100644
--- a/tests/GPU/DomainFlowLES/DomainFlowLES.data
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES.data
@@ -81,7 +81,8 @@ Scatter DOM.Zones dom
 END SCATTER #
 
 # Discretization #
-VEFPrep1B dis
+VEFPrep1B dis 
+Lire dis { reorder { algo Hilbert } }
 
 Runge_Kutta_Rationnel_ordre_2 sch
 Lire sch
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.adastra_gfx90a b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.adastra_gfx90a
index 80e249a8cb..856fc2d793 100644
--- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.adastra_gfx90a
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 18:03:12
-OS:       g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     24-05-2026 -- 15:56:05
+OS:       g1321__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,41 +22,41 @@ Total number of elements used for the calculation: 3276800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                67.2841        
+Total time of the start-up:                                                55.684         
 
 Number of calls to the linear solver per time step:                        4              
-Average time of the resolution of the linear problem per call:             0.793104       
-Average number of iteration of the linear solver per call:                 21             
+Average time of the resolution of the linear problem per call:             1.0001         
+Average number of iteration of the linear solver per call:                 21.75          
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               11.8106        
+Total time of the time loop:                                               9.88073        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.31229        
-Standard deviation between time steps:                                     0.083117       
-Time elapsed in the skipped time steps:                                    2.8317         
+Average time per time step:                                                1.09786        
+Standard deviation between time steps:                                     0.0775836      
+Time elapsed in the skipped time steps:                                    2.6768         
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.20118         | 12.4        | 4              
-Matrix assembly for implicit scheme      | 0.1265401       |  7.8        | 6              
-Convection operator                      | 0.4557769       | 28.0        | 8              
-Diffusion operator                       | 0.1462312       |  9.0        | 36             
-Gradient operator                        | 0.1370221       |  8.4        | 9              
-Divergence operator                      | 0.04048505      |  2.5        | 8              
-Source terms                             | 0.002067465     |  0.1        | 6              
-Update ::mettre_a_jour                   | 0.07502718      |  4.6        | 5              
-Solver for implicit diffusion            | 0.0577835       |  3.6        | 6              
-Computation of the time step dt          | 0.02468548      |  1.5        | 10             
-Turbulence model::update                 | 0.01060921      |  0.7        | 2              
-Post-treatment operations                | 0.0282308       |  1.7        | 2              
-Other operations                         | 0.006654609     |  0.4        | 
+Linear solver resolutions Ax=B           | 0.174566        | 15.9        | 4              
+Matrix assembly for implicit scheme      | 0.1055591       |  9.6        | 6              
+Convection operator                      | 0.4354912       | 39.7        | 8              
+Diffusion operator                       | 0.1078714       |  9.8        | 36             
+Gradient operator                        | 0.06912533      |  6.3        | 9              
+Divergence operator                      | 0.02441109      |  2.2        | 8              
+Source terms                             | 0.001850366     |  0.2        | 6              
+Update ::mettre_a_jour                   | 0.04996588      |  4.6        | 5              
+Solver for implicit diffusion            | 0.05493421      |  5.0        | 6              
+Computation of the time step dt          | 0.01905284      |  1.7        | 10             
+Turbulence model::update                 | 0.008911962     |  0.8        | 2              
+Post-treatment operations                | 0.02671327      |  2.4        | 2              
+Other operations                         | 0.01940543      |  1.8        | 
 
-Average number of iteration of the linear solver per call:                 35.3           
+Average number of iteration of the linear solver per call:                 35.7           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -64,17 +64,17 @@ Average number of iteration of the linear solver per call:                 35.3
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.200505        | 15.3        | 4               | 
-Kernels:                                 | 1.07348         | 81.8        | 1993            | 
-Copy host to device:                     | 0.000944822     |  0.1        | 43              | 3.1 GB/s
-Copy device to host:                     | 0.00182325      |  0.1        | 7               | 20.4 GB/s
-Alloc/Free on device:                    | 0.000348043     |  0.0        | 23              | 
-GPU: 97% Copy H<->D: 0.21% Alloc/free: 0.027% Comm: 0% CPU & I/O: 2.7%
+Libraries:                               | 0.173899        | 15.8        | 4               | 
+Kernels:                                 | 0.885519        | 80.7        | 1991            | 
+Copy host to device:                     | 0.000952295     |  0.1        | 43              | 3.1 GB/s
+Copy device to host:                     | 0.0018143       |  0.2        | 7               | 20.5 GB/s
+Alloc/Free on device:                    | 0.000332402     |  0.0        | 23              | 
+GPU: 96% Copy H<->D: 0.25% Alloc/free: 0.03% Comm: 0% CPU & I/O: 3.2%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.342931       
+Time of the post-resolution:                                               0.342387       
 
-Total time for the whole computation                                       82.2694        
+Total time for the whole computation                                       68.5839        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (75 s):  0.475 kW  0.010 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.dalianvl_cc100 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..4dda32c8dd
--- /dev/null
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,79 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the DomainFlowLES_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:14:07
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 3276800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                48.5207        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.98739        
+Average number of iteration of the linear solver per call:                 21.75          
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.19618        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.24402        
+Standard deviation between time steps:                                     0.0459336      
+Time elapsed in the skipped time steps:                                    1.21579        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0962815       | 39.5        | 4              
+Matrix assembly for implicit scheme      | 0.01431973      |  5.9        | 6              
+Convection operator                      | 0.02844478      | 11.7        | 8              
+Diffusion operator                       | 0.02991948      | 12.3        | 36             
+Gradient operator                        | 0.007994795     |  3.3        | 9              
+Divergence operator                      | 0.002778588     |  1.1        | 8              
+Source terms                             | 0.0007949644    |  0.3        | 6              
+Update ::mettre_a_jour                   | 0.009183308     |  3.8        | 5              
+Solver for implicit diffusion            | 0.0204651       |  8.4        | 6              
+Computation of the time step dt          | 0.002499168     |  1.0        | 10             
+Turbulence model::update                 | 0.001439575     |  0.6        | 2              
+Post-treatment operations                | 0.01804008      |  7.4        | 2              
+Other operations                         | 0.01185865      |  4.9        | 
+
+Average number of iteration of the linear solver per call:                 36.1           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0958068       | 39.3        | 4               | 
+Kernels:                                 | 0.117053        | 48.0        | 1991            | 
+Copy host to device:                     | 0.000836579     |  0.3        | 43              | 3.5 GB/s
+Copy device to host:                     | 0.000433294     |  0.2        | 7               | 85.9 GB/s
+Alloc/Free on device:                    | 0.000928713     |  0.4        | 23              | 
+GPU: 87% Copy H<->D: 0.52% Alloc/free: 0.38% Comm: 0% CPU & I/O: 12%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               4.63836        
+
+Total time for the whole computation                                       56.571         
+
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.eureka_cc89 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..91f11500bd
--- /dev/null
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.eureka_cc89
@@ -0,0 +1,79 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the DomainFlowLES_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:30:16
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 3276800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                53.5087        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             1.18754        
+Average number of iteration of the linear solver per call:                 21.75          
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               4.49963        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.499959       
+Standard deviation between time steps:                                     0.0428608      
+Time elapsed in the skipped time steps:                                    3.00568        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.16093         | 32.2        | 4              
+Matrix assembly for implicit scheme      | 0.01970638      |  3.9        | 6              
+Convection operator                      | 0.09048199      | 18.1        | 8              
+Diffusion operator                       | 0.0725157       | 14.5        | 36             
+Gradient operator                        | 0.02036548      |  4.1        | 9              
+Divergence operator                      | 0.008354662     |  1.7        | 8              
+Source terms                             | 0.001649874     |  0.3        | 6              
+Update ::mettre_a_jour                   | 0.01730522      |  3.5        | 5              
+Solver for implicit diffusion            | 0.05971053      | 11.9        | 6              
+Computation of the time step dt          | 0.007172377     |  1.4        | 10             
+Turbulence model::update                 | 0.004708388     |  0.9        | 2              
+Post-treatment operations                | 0.01345581      |  2.7        | 2              
+Other operations                         | 0.02360189      |  4.7        | 
+
+Average number of iteration of the linear solver per call:                 35.4           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.160556        | 32.1        | 4               | 
+Kernels:                                 | 0.312891        | 62.6        | 1991            | 
+Copy host to device:                     | 0.000655779     |  0.1        | 43              | 4.4 GB/s
+Copy device to host:                     | 0.00319411      |  0.6        | 7               | 11.7 GB/s
+Alloc/Free on device:                    | 0.000571484     |  0.1        | 23              | 
+GPU: 95% Copy H<->D: 0.77% Alloc/free: 0.11% Comm: 0% CPU & I/O: 4.4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0734133      
+
+Total time for the whole computation                                       61.0875        
+
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.irene-amd-ccrt_cc70
index 1860138c54..f0a90ec939 100644
--- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.irene-amd-ccrt_cc70
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.irene-amd-ccrt_cc70
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 14:11:03
-OS:       irene7056__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     23-04-2026 -- 14:46:40
+OS:       irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
 Total number of threads:80
 GPU model: Tesla V100-SXM2-16GB
@@ -22,39 +22,39 @@ Total number of elements used for the calculation: 3276800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                84.3428        
+Total time of the start-up:                                                89.5421        
 
 Number of calls to the linear solver per time step:                        4              
-Average time of the resolution of the linear problem per call:             1.57906        
+Average time of the resolution of the linear problem per call:             1.65017        
 Average number of iteration of the linear solver per call:                 21.75          
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               10.5634        
+Total time of the time loop:                                               10.4615        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.17371        
-Standard deviation between time steps:                                     0.10204        
-Time elapsed in the skipped time steps:                                    4.49973        
+Average time per time step:                                                1.16239        
+Standard deviation between time steps:                                     0.107809       
+Time elapsed in the skipped time steps:                                    4.67111        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.281414        | 24.0        | 4              
-Matrix assembly for implicit scheme      | 0.04662882      |  4.0        | 6              
-Convection operator                      | 0.3456106       | 29.4        | 8              
-Diffusion operator                       | 0.1786968       | 15.2        | 36             
-Gradient operator                        | 0.0569327       |  4.9        | 9              
-Divergence operator                      | 0.04346049      |  3.7        | 8              
-Source terms                             | 0.00252737      |  0.2        | 6              
-Update ::mettre_a_jour                   | 0.04792681      |  4.1        | 5              
-Solver for implicit diffusion            | 0.07688781      |  6.6        | 6              
-Computation of the time step dt          | 0.02732393      |  2.3        | 10             
-Turbulence model::update                 | 0.01237735      |  1.1        | 2              
-Post-treatment operations                | 0.03411321      |  2.9        | 2              
-Other operations                         | 0.0198106       |  1.7        | 
+Linear solver resolutions Ax=B           | 0.281586        | 24.2        | 4              
+Matrix assembly for implicit scheme      | 0.04644436      |  4.0        | 6              
+Convection operator                      | 0.3458352       | 29.8        | 8              
+Diffusion operator                       | 0.1801068       | 15.5        | 36             
+Gradient operator                        | 0.04069576      |  3.5        | 9              
+Divergence operator                      | 0.04349869      |  3.7        | 8              
+Source terms                             | 0.002216718     |  0.2        | 6              
+Update ::mettre_a_jour                   | 0.0466742       |  4.0        | 5              
+Solver for implicit diffusion            | 0.07746529      |  6.7        | 6              
+Computation of the time step dt          | 0.02720833      |  2.3        | 10             
+Turbulence model::update                 | 0.01275307      |  1.1        | 2              
+Post-treatment operations                | 0.03633502      |  3.1        | 2              
+Other operations                         | 0.02157274      |  1.9        | 
 
 Average number of iteration of the linear solver per call:                 35.4           
 
@@ -64,17 +64,17 @@ Average number of iteration of the linear solver per call:                 35.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.280557        | 23.9        | 4               | 
-Kernels:                                 | 0.840487        | 71.6        | 1993            | 
-Copy host to device:                     | 0.00142747      |  0.1        | 43              | 2.0 GB/s
-Copy device to host:                     | 0.00819653      |  0.7        | 7               | 4.5 GB/s
-Alloc/Free on device:                    | 0.000885623     |  0.1        | 23              | 
-GPU: 96% Copy H<->D: 0.82% Alloc/free: 0.075% Comm: 0% CPU & I/O: 3.6%
+Libraries:                               | 0.280644        | 24.1        | 4               | 
+Kernels:                                 | 0.824854        | 71.0        | 1991            | 
+Copy host to device:                     | 0.00150175      |  0.1        | 43              | 1.9 GB/s
+Copy device to host:                     | 0.00918955      |  0.8        | 7               | 4.1 GB/s
+Alloc/Free on device:                    | 0.000722558     |  0.1        | 23              | 
+GPU: 95% Copy H<->D: 0.92% Alloc/free: 0.062% Comm: 0% CPU & I/O: 3.9%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.384397       
+Time of the post-resolution:                                               0.253759       
 
-Total time for the whole computation                                       99.7903        
+Total time for the whole computation                                       104.929        
 
-[Slurm] Power consumption (116 s):  0.222 kW  0.007 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (119 s):  0.195 kW  0.006 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is157091_cc86 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is157091_cc86
index 83cb9631bc..647ab137d1 100644
--- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is157091_cc86
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is157091_cc86
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     10-03-2026 -- 08:38:40
+Date:     14-05-2026 -- 15:57:08
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
 GPU model: NVIDIA RTX A6000
 CUDA runtime version: 12.90
-CUDA drivers version: 12.70
+CUDA drivers version: 13.20
 Nb procs used for the computation: 1
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 3276800
@@ -22,39 +22,39 @@ Total number of elements used for the calculation: 3276800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                55.0303        
+Total time of the start-up:                                                56.4516        
 
 Number of calls to the linear solver per time step:                        4              
-Average time of the resolution of the linear problem per call:             0.852492       
+Average time of the resolution of the linear problem per call:             1.05844        
 Average number of iteration of the linear solver per call:                 21.75          
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               9.17717        
+Total time of the time loop:                                               8.35729        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.01969        
-Standard deviation between time steps:                                     0.0706814      
-Time elapsed in the skipped time steps:                                    2.88004        
+Average time per time step:                                                0.928588       
+Standard deviation between time steps:                                     0.0665702      
+Time elapsed in the skipped time steps:                                    2.86698        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.278561        | 27.3        | 4              
-Matrix assembly for implicit scheme      | 0.03953483      |  3.9        | 6              
-Convection operator                      | 0.2709577       | 26.6        | 8              
-Diffusion operator                       | 0.1310476       | 12.9        | 36             
-Gradient operator                        | 0.04856839      |  4.8        | 9              
-Divergence operator                      | 0.03368633      |  3.3        | 8              
-Source terms                             | 0.002160707     |  0.2        | 6              
-Update ::mettre_a_jour                   | 0.0415759       |  4.1        | 5              
-Solver for implicit diffusion            | 0.08429423      |  8.3        | 6              
-Computation of the time step dt          | 0.02971084      |  2.9        | 10             
-Turbulence model::update                 | 0.00944341      |  0.9        | 2              
-Post-treatment operations                | 0.02223871      |  2.2        | 2              
-Other operations                         | 0.02790565      |  2.7        | 
+Linear solver resolutions Ax=B           | 0.265963        | 28.6        | 4              
+Matrix assembly for implicit scheme      | 0.04038793      |  4.3        | 6              
+Convection operator                      | 0.2593428       | 27.9        | 8              
+Diffusion operator                       | 0.1087147       | 11.7        | 36             
+Gradient operator                        | 0.02225161      |  2.4        | 9              
+Divergence operator                      | 0.02232687      |  2.4        | 8              
+Source terms                             | 0.002280371     |  0.2        | 6              
+Update ::mettre_a_jour                   | 0.03501159      |  3.8        | 5              
+Solver for implicit diffusion            | 0.08865022      |  9.5        | 6              
+Computation of the time step dt          | 0.02310452      |  2.5        | 10             
+Turbulence model::update                 | 0.009679251     |  1.0        | 2              
+Post-treatment operations                | 0.02006731      |  2.2        | 2              
+Other operations                         | 0.03080791      |  3.3        | 
 
 Average number of iteration of the linear solver per call:                 35.4           
 
@@ -64,16 +64,16 @@ Average number of iteration of the linear solver per call:                 35.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.275962        | 27.1        | 4               | 
-Kernels:                                 | 0.709639        | 69.6        | 1993            | 
-Copy host to device:                     | 0.000689055     |  0.1        | 43              | 4.2 GB/s
-Copy device to host:                     | 0.00311297      |  0.3        | 7               | 12.0 GB/s
-Alloc/Free on device:                    | 0.000785802     |  0.1        | 23              | 
-GPU: 97% Copy H<->D: 0.37% Alloc/free: 0.077% Comm: 0% CPU & I/O: 2.9%
+Libraries:                               | 0.265066        | 28.5        | 4               | 
+Kernels:                                 | 0.63096         | 67.9        | 1991            | 
+Copy host to device:                     | 0.000694008     |  0.1        | 43              | 4.2 GB/s
+Copy device to host:                     | 0.00416774      |  0.4        | 7               | 8.9 GB/s
+Alloc/Free on device:                    | 0.000657239     |  0.1        | 23              | 
+GPU: 96% Copy H<->D: 0.52% Alloc/free: 0.071% Comm: 0% CPU & I/O: 2.9%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0926276      
+Time of the post-resolution:                                               0.0958504      
 
-Total time for the whole computation                                       67.1801        
+Total time for the whole computation                                       67.7717        
 
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is159479_cc120 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..6f4dfb3cd4
--- /dev/null
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is159479_cc120
@@ -0,0 +1,79 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the DomainFlowLES_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:33:44
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 3276800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                40.8316        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             0.59519        
+Average number of iteration of the linear solver per call:                 21.75          
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.72965        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.303294       
+Standard deviation between time steps:                                     0.030574       
+Time elapsed in the skipped time steps:                                    1.69938        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.119445        | 39.4        | 4              
+Matrix assembly for implicit scheme      | 0.0119855       |  4.0        | 6              
+Convection operator                      | 0.04748959      | 15.7        | 8              
+Diffusion operator                       | 0.04161902      | 13.7        | 36             
+Gradient operator                        | 0.01082988      |  3.6        | 9              
+Divergence operator                      | 0.00443817      |  1.5        | 8              
+Source terms                             | 0.0008179478    |  0.3        | 6              
+Update ::mettre_a_jour                   | 0.01174738      |  3.9        | 5              
+Solver for implicit diffusion            | 0.02742291      |  9.0        | 6              
+Computation of the time step dt          | 0.003806219     |  1.3        | 10             
+Turbulence model::update                 | 0.004209919     |  1.4        | 2              
+Post-treatment operations                | 0.009942449     |  3.3        | 2              
+Other operations                         | 0.00953959      |  3.1        | 
+
+Average number of iteration of the linear solver per call:                 35.3           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.119216        | 39.3        | 4               | 
+Kernels:                                 | 0.165121        | 54.4        | 1991            | 
+Copy host to device:                     | 0.000505459     |  0.2        | 43              | 5.8 GB/s
+Copy device to host:                     | 0.00358184      |  1.2        | 7               | 10.4 GB/s
+Alloc/Free on device:                    | 0.000322484     |  0.1        | 23              | 
+GPU: 94% Copy H<->D: 1.3% Alloc/free: 0.11% Comm: 0% CPU & I/O: 4.8%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0774195      
+
+Total time for the whole computation                                       45.3381        
+
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is247793_gfx1100 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is247793_gfx1100
index 471ab9ec4c..a6a3ca6c4a 100644
--- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is247793_gfx1100
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.is247793_gfx1100
@@ -1,54 +1,79 @@
-Statistiques d'initialisation du calcul
-
-Temps total                       77.5112
-
-Statistiques de resolution du probleme
-
-Temps total                       15.4304
-
-
-Timesteps                         10
-Secondes / pas de temps           1.54304
-Dont solveurs Ax=B                0.299995 19% (4 appels/pas de temps)
-Dont solveur diffusion_implicite  0.103985  6% (6 appels/pas de temps)
-Dont assemblage matrice_implicite 0.122103  7% (6 appels/pas de temps)
-Dont mettre_a_jour                0.050672  3% (5 appels/pas de temps)
-Dont operateurs convection        0.364616 23% (8 appels/pas de temps)
-Dont operateurs diffusion         0.245065 15% (35.9 appels/pas de temps)
-Dont operateurs gradient          0.059774  3% (9 appels/pas de temps)
-Dont operateurs divergence        0.022762  1% (8.2 appels/pas de temps)
-Dont operateurs source            0.006063  0% (6 appels/pas de temps)
-Dont operations postraitement     0.202759 13% (2 appels/pas de temps)
-Dont calcul dt                    0.016310  1% (10 appels/pas de temps)
-Dont modele turbulence            0.011045  0% (2 appels/pas de temps)
-Dont calcul divers                0.037886  2% (0 appels/pas de temps)
-Nb solveur / pas de temps         4
-Secondes / solveur                0.0749988
-Iterations / solveur              27.25
-GPU statistics per time step (experimental):
-Libraries : 0.299279 s 19.4%  4.0 calls
-Kernels   : 0.914389 s 59.3% 64035.4 calls
-Copy H2D  : 0.034075 s  2.2% 86.4 calls 11.8 GB/s
-Copy D2H  : 0.012526 s  0.8% 101.0 calls 18.3 GB/s
-Alloc/Free: 0.007877 s  0.5% 57.5 calls
-GPU: 78.6% Copy H<->D: 3% Alloc/Free: 0.5% Comm: 0% CPU & Others: 17.8%
-I/O:
-
-Timesteps = number of time steps
-Nb solveur = number of linear system resolutions
-Nb assemblage implicite = number of matrix assemblies for the implicit scheme
-Iterations = average number of iterations of the solver
-Communications = fraction of the time spent
-                 in communications between processors (excluding io files)
-Network latency = time of one mpsum measured by an internal bench over 0.1s
-Network bandwidth = maximum on all processors
-                    of the average bandwidth of send_recv operations
-Waiting time = estimation of the waiting time of the different processors
-
-Max_waiting_time big    => probably due to a bad partitioning
-Communications > 30%    => too many processors or network too slow
-
-Statistiques de post resolution
-
-Temps total                       0.422806
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the DomainFlowLES_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 17:32:53
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 3276800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                48.1642        
+
+Number of calls to the linear solver per time step:                        4              
+Average time of the resolution of the linear problem per call:             1.30368        
+Average number of iteration of the linear solver per call:                 41.25          
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               9.09926        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                1.01103        
+Standard deviation between time steps:                                     0.0525723      
+Time elapsed in the skipped time steps:                                    2.64799        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.411071        | 40.7        | 4              
+Matrix assembly for implicit scheme      | 0.05677431      |  5.6        | 6              
+Convection operator                      | 0.17899         | 17.7        | 8              
+Diffusion operator                       | 0.1028158       | 10.2        | 36             
+Gradient operator                        | 0.03740054      |  3.7        | 9              
+Divergence operator                      | 0.01530846      |  1.5        | 8              
+Source terms                             | 0.002693547     |  0.3        | 6              
+Update ::mettre_a_jour                   | 0.03649897      |  3.6        | 5              
+Solver for implicit diffusion            | 0.0935207       |  9.3        | 6              
+Computation of the time step dt          | 0.01495112      |  1.5        | 10             
+Turbulence model::update                 | 0.005950598     |  0.6        | 2              
+Post-treatment operations                | 0.01349221      |  1.3        | 2              
+Other operations                         | 0.04156191      |  4.1        | 
+
+Average number of iteration of the linear solver per call:                 40.1           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.410401        | 40.6        | 4               | 
+Kernels:                                 | 0.57806         | 57.2        | 1991            | 
+Copy host to device:                     | 0.00114789      |  0.1        | 43              | 2.5 GB/s
+Copy device to host:                     | 0.00192947      |  0.2        | 7               | 19.3 GB/s
+Alloc/Free on device:                    | 0.000596136     |  0.1        | 23              | 
+GPU: 98% Copy H<->D: 0.3% Alloc/free: 0.059% Comm: 0% CPU & I/O: 1.9%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.154435       
+
+Total time for the whole computation                                       60.0659        
 
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.jean-zay_cc90 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.jean-zay_cc90
index 8c7e324927..929ea656a8 100644
--- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.jean-zay_cc90
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.jean-zay_cc90
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 13:59:18
-OS:       jzxh136__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+Date:     10-06-2026 -- 10:35:02
+OS:       jzxh022__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026
 CPU model : Intel(R) Xeon(R) Platinum 8468
 Total number of threads:192
 GPU model: NVIDIA H100 80GB HBM3
 CUDA runtime version: 12.60
-CUDA drivers version: 13.0
+CUDA drivers version: 13.20
 Nb procs used for the computation: 1
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 3276800
@@ -22,39 +22,39 @@ Total number of elements used for the calculation: 3276800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                57.7416        
+Total time of the start-up:                                                41.3232        
 
 Number of calls to the linear solver per time step:                        4              
-Average time of the resolution of the linear problem per call:             1.02785        
+Average time of the resolution of the linear problem per call:             0.791435       
 Average number of iteration of the linear solver per call:                 21.75          
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               3.39787        
+Total time of the time loop:                                               3.20102        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.377541       
-Standard deviation between time steps:                                     0.0880077      
-Time elapsed in the skipped time steps:                                    2.99383        
+Average time per time step:                                                0.355669       
+Standard deviation between time steps:                                     0.0925432      
+Time elapsed in the skipped time steps:                                    2.7195         
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.11934         | 31.6        | 4              
-Matrix assembly for implicit scheme      | 0.01312974      |  3.5        | 6              
-Convection operator                      | 0.08059334      | 21.3        | 8              
-Diffusion operator                       | 0.04303252      | 11.4        | 36             
-Gradient operator                        | 0.0157995       |  4.2        | 9              
-Divergence operator                      | 0.007773447     |  2.1        | 8              
-Source terms                             | 0.0009952992    |  0.3        | 6              
-Update ::mettre_a_jour                   | 0.01492743      |  4.0        | 5              
-Solver for implicit diffusion            | 0.02761908      |  7.3        | 6              
-Computation of the time step dt          | 0.004369273     |  1.2        | 10             
-Turbulence model::update                 | 0.00482918      |  1.3        | 2              
-Post-treatment operations                | 0.03422401      |  9.1        | 2              
-Other operations                         | 0.01090807      |  2.9        | 
+Linear solver resolutions Ax=B           | 0.118033        | 33.2        | 4              
+Matrix assembly for implicit scheme      | 0.01421842      |  4.0        | 6              
+Convection operator                      | 0.06594422      | 18.5        | 8              
+Diffusion operator                       | 0.03895065      | 11.0        | 36             
+Gradient operator                        | 0.01349732      |  3.8        | 9              
+Divergence operator                      | 0.003659466     |  1.0        | 8              
+Source terms                             | 0.001029953     |  0.3        | 6              
+Update ::mettre_a_jour                   | 0.01316835      |  3.7        | 5              
+Solver for implicit diffusion            | 0.03054857      |  8.6        | 6              
+Computation of the time step dt          | 0.003698007     |  1.0        | 10             
+Turbulence model::update                 | 0.004816221     |  1.4        | 2              
+Post-treatment operations                | 0.03526975      |  9.9        | 2              
+Other operations                         | 0.0128344       |  3.6        | 
 
 Average number of iteration of the linear solver per call:                 35.4           
 
@@ -64,17 +64,17 @@ Average number of iteration of the linear solver per call:                 35.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.118884        | 31.5        | 4               | 
-Kernels:                                 | 0.205998        | 54.6        | 1993            | 
-Copy host to device:                     | 0.000907065     |  0.2        | 43              | 3.2 GB/s
-Copy device to host:                     | 0.00452331      |  1.2        | 7               | 8.2 GB/s
-Alloc/Free on device:                    | 0.000677801     |  0.2        | 23              | 
-GPU: 86% Copy H<->D: 1.4% Alloc/free: 0.18% Comm: 0% CPU & I/O: 12%
+Libraries:                               | 0.117593        | 33.1        | 4               | 
+Kernels:                                 | 0.185695        | 52.2        | 1991            | 
+Copy host to device:                     | 0.000912839     |  0.3        | 43              | 3.2 GB/s
+Copy device to host:                     | 0.00452191      |  1.3        | 7               | 8.2 GB/s
+Alloc/Free on device:                    | 0.000529603     |  0.1        | 23              | 
+GPU: 85% Copy H<->D: 1.5% Alloc/free: 0.15% Comm: 0% CPU & I/O: 13%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.28617        
+Time of the post-resolution:                                               0.251046       
 
-Total time for the whole computation                                       64.4195        
+Total time for the whole computation                                       47.4947        
 
-[Slurm] Power consumption (79 s):  0.443 kW  0.010 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (56 s):  0.441 kW  0.007 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.lumi_gfx90a b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.lumi_gfx90a
index e2bf102ad2..65f7f6e603 100644
--- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.lumi_gfx90a
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.lumi_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     23-02-2026 -- 23:56:09
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     18-05-2026 -- 08:46:58
+OS:       nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,41 +22,41 @@ Total number of elements used for the calculation: 3276800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                102.82         
+Total time of the start-up:                                                116.834        
 
 Number of calls to the linear solver per time step:                        4              
-Average time of the resolution of the linear problem per call:             1.87468        
-Average number of iteration of the linear solver per call:                 19.5           
+Average time of the resolution of the linear problem per call:             1.98809        
+Average number of iteration of the linear solver per call:                 21             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               12.0037        
+Total time of the time loop:                                               9.37967        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.33374        
-Standard deviation between time steps:                                     0.0778389      
-Time elapsed in the skipped time steps:                                    3.78639        
+Average time per time step:                                                1.04219        
+Standard deviation between time steps:                                     0.0675697      
+Time elapsed in the skipped time steps:                                    3.91614        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.20903         | 11.9        | 4              
-Matrix assembly for implicit scheme      | 0.1245094       |  7.1        | 6              
-Convection operator                      | 0.4432143       | 25.3        | 8              
-Diffusion operator                       | 0.1587154       |  9.0        | 36             
-Gradient operator                        | 0.1311887       |  7.5        | 9              
-Divergence operator                      | 0.03958435      |  2.3        | 8              
-Source terms                             | 0.001954178     |  0.1        | 6              
-Update ::mettre_a_jour                   | 0.1000263       |  5.7        | 5              
-Solver for implicit diffusion            | 0.05686906      |  3.2        | 6              
-Computation of the time step dt          | 0.02558297      |  1.5        | 10             
-Turbulence model::update                 | 0.01002186      |  0.6        | 2              
-Post-treatment operations                | 0.02588545      |  1.5        | 2              
-Other operations                         | 0.00715868      |  0.4        | 
+Linear solver resolutions Ax=B           | 0.159419        | 15.3        | 4              
+Matrix assembly for implicit scheme      | 0.09955579      |  9.6        | 6              
+Convection operator                      | 0.420294        | 40.3        | 8              
+Diffusion operator                       | 0.1025904       |  9.8        | 36             
+Gradient operator                        | 0.06268515      |  6.0        | 9              
+Divergence operator                      | 0.02317575      |  2.2        | 8              
+Source terms                             | 0.001829052     |  0.2        | 6              
+Update ::mettre_a_jour                   | 0.04758081      |  4.6        | 5              
+Solver for implicit diffusion            | 0.05482205      |  5.3        | 6              
+Computation of the time step dt          | 0.01917573      |  1.8        | 10             
+Turbulence model::update                 | 0.00881079      |  0.8        | 2              
+Post-treatment operations                | 0.02308013      |  2.2        | 2              
+Other operations                         | 0.01916755      |  1.8        | 
 
-Average number of iteration of the linear solver per call:                 34.5           
+Average number of iteration of the linear solver per call:                 35.3           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -64,17 +64,17 @@ Average number of iteration of the linear solver per call:                 34.5
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.206993        | 15.5        | 4               | 
-Kernels:                                 | 1.09133         | 81.8        | 1993            | 
-Copy host to device:                     | 0.000990319     |  0.1        | 43              | 2.9 GB/s
-Copy device to host:                     | 0.00182308      |  0.1        | 7               | 20.4 GB/s
-Alloc/Free on device:                    | 0.00034577      |  0.0        | 23              | 
-GPU: 97% Copy H<->D: 0.21% Alloc/free: 0.026% Comm: 0% CPU & I/O: 2.4%
+Libraries:                               | 0.158676        | 15.2        | 4               | 
+Kernels:                                 | 0.847624        | 81.3        | 1991            | 
+Copy host to device:                     | 0.00099656      |  0.1        | 43              | 2.9 GB/s
+Copy device to host:                     | 0.00182369      |  0.2        | 7               | 20.4 GB/s
+Alloc/Free on device:                    | 0.00034191      |  0.0        | 23              | 
+GPU: 97% Copy H<->D: 0.27% Alloc/free: 0.033% Comm: 0% CPU & I/O: 3.1%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.577526       
+Time of the post-resolution:                                               0.696635       
 
-Total time for the whole computation                                       119.187        
+Total time for the whole computation                                       130.827        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (155 s):  0.503 kW  0.022 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.topaze_cc80 b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.topaze_cc80
index f863c3cda3..5b32389127 100644
--- a/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.topaze_cc80
+++ b/tests/GPU/DomainFlowLES/DomainFlowLES_BENCH.TU.topaze_cc80
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 17:41:21
-OS:       topaze7059__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     15-05-2026 -- 13:28:03
+OS:       topaze7064__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,39 +22,39 @@ Total number of elements used for the calculation: 3276800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                75.2339        
+Total time of the start-up:                                                75.4274        
 
 Number of calls to the linear solver per time step:                        4              
-Average time of the resolution of the linear problem per call:             1.19536        
+Average time of the resolution of the linear problem per call:             1.22135        
 Average number of iteration of the linear solver per call:                 21.75          
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               4.85899        
+Total time of the time loop:                                               4.19424        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.539888       
-Standard deviation between time steps:                                     0.0744115      
-Time elapsed in the skipped time steps:                                    3.20339        
+Average time per time step:                                                0.466026       
+Standard deviation between time steps:                                     0.0711581      
+Time elapsed in the skipped time steps:                                    3.00923        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.163403        | 18.2        | 4              
-Matrix assembly for implicit scheme      | 0.02544632      |  2.8        | 6              
-Convection operator                      | 0.1210334       | 13.5        | 8              
-Diffusion operator                       | 0.06967004      |  7.8        | 36             
-Gradient operator                        | 0.02544889      |  2.8        | 9              
-Divergence operator                      | 0.0106954       |  1.2        | 8              
-Source terms                             | 0.002139126     |  0.2        | 6              
-Update ::mettre_a_jour                   | 0.01935749      |  2.2        | 5              
-Solver for implicit diffusion            | 0.0451121       |  5.0        | 6              
-Computation of the time step dt          | 0.009243173     |  1.0        | 10             
-Turbulence model::update                 | 0.004829846     |  0.5        | 2              
-Post-treatment operations                | 0.02625407      |  2.9        | 2              
-Other operations                         | 0.01725465      |  1.9        | 
+Linear solver resolutions Ax=B           | 0.160839        | 34.5        | 4              
+Matrix assembly for implicit scheme      | 0.02282103      |  4.9        | 6              
+Convection operator                      | 0.09259933      | 19.9        | 8              
+Diffusion operator                       | 0.05369537      | 11.5        | 36             
+Gradient operator                        | 0.01970627      |  4.2        | 9              
+Divergence operator                      | 0.005499746     |  1.2        | 8              
+Source terms                             | 0.001331374     |  0.3        | 6              
+Update ::mettre_a_jour                   | 0.01598558      |  3.4        | 5              
+Solver for implicit diffusion            | 0.04173282      |  9.0        | 6              
+Computation of the time step dt          | 0.00593109      |  1.3        | 10             
+Turbulence model::update                 | 0.003905554     |  0.8        | 2              
+Post-treatment operations                | 0.02473622      |  5.3        | 2              
+Other operations                         | 0.0172433       |  3.7        | 
 
 Average number of iteration of the linear solver per call:                 35.4           
 
@@ -64,16 +64,17 @@ Average number of iteration of the linear solver per call:                 35.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.162821        | 30.2        | 4               | 
-Kernels:                                 | 0.334537        | 62.0        | 1993            | 
-Copy host to device:                     | 0.000898156     |  0.2        | 43              | 3.2 GB/s
-Copy device to host:                     | 0.00366181      |  0.7        | 7               | 10.2 GB/s
-Alloc/Free on device:                    | 0.000978722     |  0.2        | 23              | 
-GPU: 92% Copy H<->D: 0.84% Alloc/free: 0.18% Comm: 0% CPU & I/O: 6.9%
+Libraries:                               | 0.16027         | 34.4        | 4               | 
+Kernels:                                 | 0.265791        | 57.0        | 1991            | 
+Copy host to device:                     | 0.000874332     |  0.2        | 43              | 3.3 GB/s
+Copy device to host:                     | 0.00237432      |  0.5        | 7               | 15.7 GB/s
+Alloc/Free on device:                    | 0.000643922     |  0.1        | 23              | 
+GPU: 91% Copy H<->D: 0.7% Alloc/free: 0.14% Comm: 0% CPU & I/O: 7.7%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.480654       
+Time of the post-resolution:                                               0.277696       
 
-Total time for the whole computation                                       83.7769        
+Total time for the whole computation                                       82.9086        
 
+[Slurm] Power consumption (114 s):  0.577 kW  0.018 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/DomainFlowLES/check_perf.sh b/tests/GPU/DomainFlowLES/check_perf.sh
index 0c9c696391..4972132040 100755
--- a/tests/GPU/DomainFlowLES/check_perf.sh
+++ b/tests/GPU/DomainFlowLES/check_perf.sh
@@ -56,6 +56,7 @@ run()
    # Try to mitigate variablity by setting exclusive mode on GPU (firefox, slack, edge, chrome, use device !)
    [ "$np" = "" ] && [ "$TRUST_WITHOUT_HOST" = 1 ] && [ "`hostname`" = is157091 ] && set_EXCLUSIVE_PROCESS=`sudo ls 2>/dev/null`
    [ "$set_EXCLUSIVE_PROCESS" != "" ] && sudo nvidia-smi -c EXCLUSIVE_PROCESS 1>/dev/null
+   trust -clean 1>/dev/null 2>&1 # Clean the files for IO
    trust $nsys $jdd $np 1>$jdd.out_err 2>&1
    [ "$set_EXCLUSIVE_PROCESS" != "" ] && sudo nvidia-smi -c DEFAULT 1>/dev/null         
    check $jdd $gpu
@@ -96,6 +97,7 @@ else
    then
       [ "`grep -i 'nb_parts 8' $jdd.data`" != "" ] && run $HOST$GPU_ARCH 8
    fi
+   #[ "`grep 'PARALLEL OK' $jdd.data`" != "" ] && run $HOST$GPU_ARCH 2
 fi
 # clean
 rm -f *.sauv *.lml *.sqlite *.nsys-rep
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.data b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.data
new file mode 100644
index 0000000000..bf5eb4afa5
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.data
@@ -0,0 +1,141 @@
+# Warning: Degraded data file of GAMELAN simulation. Do not use except for non regression testing ! #
+# PARALLEL OK #
+
+dimension 3
+
+Pb_Thermohydraulique_QC pb1
+
+Domaine dom
+
+# BEGIN MESH #
+lire_fichier dom dom.geom
+/* raffiner_isotrope dom raffiner_isotrope dom */
+# END MESH #
+# BEGIN PARTITION
+Partition dom
+{
+    Partition_tool metis { Nb_parts 2 }
+    Larg_joint 2
+    zones_name dom
+    ecrire_lata dom.lata
+    reorder 1
+    single_hdf
+}
+End
+END PARTITION #
+# BEGIN SCATTER
+Scatter dom.Zones dom
+END SCATTER #
+
+VDF dis
+Lire dis { reorder { algo hilbert } }
+
+Runge_Kutta_Rationnel_ordre_2 sch
+
+Lire sch
+{
+    nb_pas_dt_max 10
+    tinit 0.
+    dt_start dt_calc
+    tmax 20.
+    dt_min 1.e-7
+    dt_max 6.e-3
+    dt_impr 1.e-7
+
+    seuil_statio 1.e-14
+    facsec 1
+    diffusion_implicite 1
+    seuil_diffusion_implicite 1.e-10
+    tcpumax 23.30
+
+}
+
+Associer pb1 dom
+Associer pb1 sch
+
+option_vdf {
+    p_imposee_aux_faces oui
+}
+Discretiser pb1 dis
+
+Lire pb1
+{
+
+    fluide_quasi_compressible {
+        gravite champ_uniforme 3 0 0. -9.81
+
+        pression   100000.
+
+        mu champ_fonc_fonction pb1 temperature 1 (0.86269e-5*val*0.02897*(8.0^0.5)*((1.0+(0.0020159/0.02897))^0.5))/(((val*0.02897*(8.0^0.5)*((1.0+(0.0020159/0.02897))^0.5))+(1.0-val)*0.0020159*((1.0+(((0.86269/1.792)^0.5)*((0.02897/0.0020159)^0.25)))^2)))+(1.792e-5*(1.0-val)*0.0020159*(8.0^0.5)*((1.0+(0.02897/0.0020159))^0.5))/((((1.0-val)*0.0020159*(8.0^0.5)*((1.0+(0.02897/0.0020159))^0.5))+(val)*0.02897*((1.0+(((1.792/0.86269)^0.5)*((0.0020159/0.02897)^0.25)))^2)))
+
+        lambda champ_fonc_fonction pb1 temperature 1 7.72e-5*(100000.0*0.02897)/(8.314472*284.15*((((0.02897/0.0020159)-1.0)*val)+1.00))
+
+        loi_etat gaz_parfait_QC {
+
+            Prandtl 0.189301713586576
+            Cp 1.
+            gamma 1.4
+        }
+
+        traitement_pth constant
+
+    }
+
+
+
+    Navier_Stokes_QC
+    {
+
+        solveur_pression amg gcp { rtol 1.e-7 impr }
+        convection { centre }
+        diffusion { }
+
+        conditions_initiales
+        {
+            vitesse Champ_uniforme 3 0. 0. 0.
+        }
+        conditions_limites
+        {
+            w_w_w_w paroi_fixe
+            c_c_c_c_c_c_c%0 paroi_fixe
+            p paroi_fixe
+            in  frontiere_ouverte_rho_U_impose  Champ_front_uniforme 3 0. 0.  0.0264513805222214
+            o_o_o_o    Frontiere_ouverte_pression_imposee Champ_front_fonc_XYZ 1  -12.0291562777197*z
+            f    Frontiere_ouverte_pression_imposee Champ_front_fonc_XYZ 1  -12.0291562777197*z
+            g  Frontiere_ouverte_pression_imposee Champ_front_fonc_XYZ 1  -12.0291562777197*z
+        }
+
+    }
+
+    Convection_diffusion_Chaleur_QC
+    {
+        convection  { quick }
+        diffusion { }
+        conditions_initiales
+        {
+            temperature Champ_Fonc_XYZ dom 1 273+10*(x+y+z)
+        }
+
+        conditions_limites
+        {
+            w_w_w_w paroi_adiabatique
+            c_c_c_c_c_c_c%0 paroi_adiabatique
+            p  paroi_adiabatique
+            in frontiere_ouverte_temperature_imposee  Champ_front_Uniforme 1 273
+            o_o_o_o  frontiere_ouverte T_ext Champ_front_Uniforme 1 273
+            f  frontiere_ouverte T_ext Champ_front_Uniforme 1 272
+            g   frontiere_ouverte T_ext Champ_front_Uniforme 1 272
+        }
+    }
+    Postraitement
+    {
+        Champs dt_post 1000
+        {
+            pression elem
+        }
+    }
+}
+Resoudre pb1
+Fin
+
+
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.lml.gz b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.lml.gz
new file mode 120000
index 0000000000..0a5fc3c894
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG.lml.gz
@@ -0,0 +1 @@
+../../Dilatable/GAMELAN/GAMELAN.lml.gz
\ No newline at end of file
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx90a b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx90a
new file mode 100644
index 0000000000..6cdfd9503a
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx90a
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     23-04-2026 -- 09:20:04
+OS:       g1109__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                10.3899        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.443242       
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.21534        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.246149       
+Standard deviation between time steps:                                     0.072186       
+Time elapsed in the skipped time steps:                                    0.545262       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0277442       | 11.3        | 2              
+Convection operator                      | 0.01435192      |  5.8        | 7              
+Diffusion operator                       | 0.01873119      |  7.6        | 16             
+Gradient operator                        | 0.004311924     |  1.8        | 5              
+Divergence operator                      | 0.001880982     |  0.8        | 6              
+Source terms                             | 0.000381754     |  0.2        | 4              
+Update ::mettre_a_jour                   | 0.01100964      |  4.5        | 4              
+Solver for implicit diffusion            | 0.004626703     |  1.9        | 4              
+Computation of the time step dt          | 0.00107997      |  0.4        | 6              
+Post-treatment operations                | 0.1561464       | 63.4        | 1              
+Other operations                         | 0.005884409     |  2.4        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0275371       | 11.2        | 2               | 
+Kernels:                                 | 0.172778        | 70.2        | 1073            | 
+Copy host to device:                     | 0.00248311      |  1.0        | 107             | 5.3 GB/s
+Copy device to host:                     | 0.00041268      |  0.2        | 6               | 11.6 GB/s
+Alloc/Free on device:                    | 2.92352e-05     |  0.0        | 856             | 
+GPU: 81% Copy H<->D: 1.2% Alloc/free: 0.012% Comm: 0% CPU & I/O: 17%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0288232      
+
+Total time for the whole computation                                       13.1794        
+
+[Slurm] Power consumption (21 s):  0.373 kW  0.002 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx942 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx942
new file mode 100644
index 0000000000..79f6321bed
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.adastra_gfx942
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     23-04-2026 -- 14:40:15
+OS:       a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD Instinct MI300A Accelerator
+Total number of threads:192
+GPU model: AMD Instinct MI300A
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                10.7076        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.491718       
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.764317       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0849242      
+Standard deviation between time steps:                                     0.0536174      
+Time elapsed in the skipped time steps:                                    0.289352       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0168719       | 19.9        | 2              
+Convection operator                      | 0.01172727      | 13.8        | 7              
+Diffusion operator                       | 0.01047739      | 12.3        | 16             
+Gradient operator                        | 0.004039262     |  4.8        | 5              
+Divergence operator                      | 0.001213214     |  1.4        | 6              
+Source terms                             | 0.0002689149    |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.006867849     |  8.1        | 4              
+Solver for implicit diffusion            | 0.004425528     |  5.2        | 4              
+Computation of the time step dt          | 0.0008362072    |  1.0        | 6              
+Post-treatment operations                | 0.02350376      | 27.7        | 1              
+Other operations                         | 0.004692891     |  5.5        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0166839       | 19.6        | 2               | 
+Kernels:                                 | 0.0298892       | 35.2        | 1073            | 
+Copy host to device:                     | 0.00228099      |  2.7        | 107             | 5.8 GB/s
+Copy device to host:                     | 0.000326292     |  0.4        | 6               | 14.7 GB/s
+Alloc/Free on device:                    | 2.7959e-05      |  0.0        | 856             | 
+GPU: 55% Copy H<->D: 3.1% Alloc/free: 0.033% Comm: 0% CPU & I/O: 42%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0255038      
+
+Total time for the whole computation                                       11.7868        
+
+[Slurm] Power consumption (21 s):  0.485 kW  0.003 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.dalianvl_cc100 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..5c43126385
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:14:24
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                5.34843        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.303297       
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.662786       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0736429      
+Standard deviation between time steps:                                     0.0515967      
+Time elapsed in the skipped time steps:                                    0.246865       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0194675       | 26.4        | 2              
+Convection operator                      | 0.008774242     | 11.9        | 7              
+Diffusion operator                       | 0.008319704     | 11.3        | 16             
+Gradient operator                        | 0.00259036      |  3.5        | 5              
+Divergence operator                      | 0.0009964828    |  1.4        | 6              
+Source terms                             | 0.0001999642    |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.003929349     |  5.3        | 4              
+Solver for implicit diffusion            | 0.002681795     |  3.6        | 4              
+Computation of the time step dt          | 0.0006292012    |  0.9        | 6              
+Post-treatment operations                | 0.02159892      | 29.3        | 1              
+Other operations                         | 0.004455399     |  6.1        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0192347       | 26.1        | 2               | 
+Kernels:                                 | 0.0204398       | 27.8        | 1116            | 
+Copy host to device:                     | 0.00176518      |  2.4        | 102             | 5.0 GB/s
+Copy device to host:                     | 0.000111953     |  0.2        | 6               | 4.5 GB/s
+Alloc/Free on device:                    | 3.53492e-05     |  0.0        | 856             | 
+GPU: 54% Copy H<->D: 2.5% Alloc/free: 0.048% Comm: 0% CPU & I/O: 44%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0184735      
+
+Total time for the whole computation                                       6.27657        
+
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.eureka_cc89 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..9b15d03726
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.eureka_cc89
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:30:32
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.16705        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.365755       
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.883002       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0981114      
+Standard deviation between time steps:                                     0.0399791      
+Time elapsed in the skipped time steps:                                    0.283736       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0294439       | 30.0        | 2              
+Convection operator                      | 0.01347316      | 13.7        | 7              
+Diffusion operator                       | 0.0176114       | 18.0        | 16             
+Gradient operator                        | 0.003485975     |  3.6        | 5              
+Divergence operator                      | 0.006129298     |  6.2        | 6              
+Source terms                             | 0.0004662337    |  0.5        | 4              
+Update ::mettre_a_jour                   | 0.005655361     |  5.8        | 4              
+Solver for implicit diffusion            | 0.002242366     |  2.3        | 4              
+Computation of the time step dt          | 0.0007874331    |  0.8        | 6              
+Post-treatment operations                | 0.01724572      | 17.6        | 1              
+Other operations                         | 0.001570476     |  1.6        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0293034       | 29.9        | 2               | 
+Kernels:                                 | 0.023372        | 23.8        | 1099            | 
+Copy host to device:                     | 0.00983065      | 10.0        | 120             | 9.1 GB/s
+Copy device to host:                     | 0.00637157      |  6.5        | 24              | 12.7 GB/s
+Alloc/Free on device:                    | 2.12862e-05     |  0.0        | 856             | 
+GPU: 54% Copy H<->D: 17% Alloc/free: 0.022% Comm: 0% CPU & I/O: 30%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0112591      
+
+Total time for the whole computation                                       7.34506        
+
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.irene-amd-ccrt_cc70
new file mode 100644
index 0000000000..f24564b48b
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.irene-amd-ccrt_cc70
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     23-04-2026 -- 09:21:57
+OS:       irene7050__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
+Total number of threads:80
+GPU model: Tesla V100-SXM2-16GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.53659        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.526084       
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.25661        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.139624       
+Standard deviation between time steps:                                     0.0814119      
+Time elapsed in the skipped time steps:                                    0.661938       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0439441       | 31.5        | 2              
+Convection operator                      | 0.01506447      | 10.8        | 7              
+Diffusion operator                       | 0.01830773      | 13.1        | 16             
+Gradient operator                        | 0.004852523     |  3.5        | 5              
+Divergence operator                      | 0.001835266     |  1.3        | 6              
+Source terms                             | 0.0004075369    |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.007209213     |  5.2        | 4              
+Solver for implicit diffusion            | 0.005015587     |  3.6        | 4              
+Computation of the time step dt          | 0.001144518     |  0.8        | 6              
+Post-treatment operations                | 0.03472955      | 24.9        | 1              
+Other operations                         | 0.007113221     |  5.1        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0436994       | 31.3        | 2               | 
+Kernels:                                 | 0.0398327       | 28.5        | 1073            | 
+Copy host to device:                     | 0.00489413      |  3.5        | 107             | 2.7 GB/s
+Copy device to host:                     | 0.00110194      |  0.8        | 6               | 4.4 GB/s
+Alloc/Free on device:                    | 2.8232e-05      |  0.0        | 856             | 
+GPU: 60% Copy H<->D: 4.3% Alloc/free: 0.02% Comm: 0% CPU & I/O: 36%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0293262      
+
+Total time for the whole computation                                       8.48449        
+
+[Slurm] Power consumption (23 s):  0.290 kW  0.002 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is157091_cc86 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is157091_cc86
new file mode 100644
index 0000000000..098287e410
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is157091_cc86
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     31-05-2026 -- 19:50:56
+OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
+CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
+Total number of threads:64
+GPU model: NVIDIA RTX A6000
+CUDA runtime version: 12.90
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                5.13875        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.361327       
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.11979        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.124421       
+Standard deviation between time steps:                                     0.0569485      
+Time elapsed in the skipped time steps:                                    0.248369       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0481099       | 38.7        | 2              
+Convection operator                      | 0.01159845      |  9.3        | 7              
+Diffusion operator                       | 0.01443568      | 11.6        | 16             
+Gradient operator                        | 0.003674675     |  3.0        | 5              
+Divergence operator                      | 0.001333877     |  1.1        | 6              
+Source terms                             | 0.0003881394    |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.008112219     |  6.5        | 4              
+Solver for implicit diffusion            | 0.004905236     |  3.9        | 4              
+Computation of the time step dt          | 0.001191429     |  1.0        | 6              
+Post-treatment operations                | 0.02340271      | 18.8        | 1              
+Other operations                         | 0.007268525     |  5.8        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0479003       | 38.5        | 2               | 
+Kernels:                                 | 0.0420805       | 33.8        | 1116            | 
+Copy host to device:                     | 0.00167663      |  1.3        | 102             | 5.3 GB/s
+Copy device to host:                     | 0.000106937     |  0.1        | 6               | 4.7 GB/s
+Alloc/Free on device:                    | 2.33561e-05     |  0.0        | 856             | 
+GPU: 72% Copy H<->D: 1.4% Alloc/free: 0.019% Comm: 0% CPU & I/O: 26%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.010622       
+
+Total time for the whole computation                                       6.51754        
+
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is159479_cc120 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..6fe6886c37
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is159479_cc120
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-05-2026 -- 07:02:42
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May  1 12:45:19 UTC 2026 (6
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                3.77582        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.224925       
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.524563       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0582848      
+Standard deviation between time steps:                                     0.0277033      
+Time elapsed in the skipped time steps:                                    0.209913       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0235559       | 40.4        | 2              
+Convection operator                      | 0.006250034     | 10.7        | 7              
+Diffusion operator                       | 0.006456884     | 11.1        | 16             
+Gradient operator                        | 0.00193865      |  3.3        | 5              
+Divergence operator                      | 0.000565465     |  1.0        | 6              
+Source terms                             | 0.0001516686    |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.002713566     |  4.7        | 4              
+Solver for implicit diffusion            | 0.001805891     |  3.1        | 4              
+Computation of the time step dt          | 0.0004955493    |  0.9        | 6              
+Post-treatment operations                | 0.01174376      | 20.1        | 1              
+Other operations                         | 0.002607339     |  4.5        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.023469        | 40.3        | 2               | 
+Kernels:                                 | 0.0159351       | 27.3        | 1116            | 
+Copy host to device:                     | 0.00123798      |  2.1        | 102             | 7.2 GB/s
+Copy device to host:                     | 8.48317e-05     |  0.1        | 6               | 5.9 GB/s
+Alloc/Free on device:                    | 1.78693e-05     |  0.0        | 856             | 
+GPU: 68% Copy H<->D: 2.3% Alloc/free: 0.031% Comm: 0% CPU & I/O: 30%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.00880884     
+
+Total time for the whole computation                                       4.51912        
+
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is246827_cc86 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is246827_cc86
new file mode 100644
index 0000000000..119f8aacad
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is246827_cc86
@@ -0,0 +1,52 @@
+Statistiques d'initialisation du calcul
+
+Temps total                       4.8905
+
+Statistiques de resolution du probleme
+
+Temps total                       3.46361
+
+
+Timesteps                         10
+Secondes / pas de temps           0.346353
+Dont solveurs Ax=B                0.170325 49% (2 appels/pas de temps)
+Dont solveur diffusion_implicite  0.012993  3% (4 appels/pas de temps)
+Dont mettre_a_jour                0.017205  4% (4 appels/pas de temps)
+Dont operateurs convection        0.036781 10% (6.9 appels/pas de temps)
+Dont operateurs diffusion         0.045840 13% (16 appels/pas de temps)
+Dont operateurs gradient          0.010809  3% (5 appels/pas de temps)
+Dont operateurs divergence        0.006897  1% (6 appels/pas de temps)
+Dont operateurs source            0.000772  0% (4 appels/pas de temps)
+Dont operations postraitement     0.018958  5% (1 appel/pas de temps)
+Dont calcul dt                    0.002290  0% (6 appels/pas de temps)
+Dont calcul divers                0.023481  6% (0 appels/pas de temps)
+Nb solveur / pas de temps         2
+Secondes / solveur                0.0851625
+Iterations / solveur              16.05
+GPU statistics per time step (experimental):
+Libraries : 0.169797 s 49.0%  2.0 calls
+Kernels   : 0.106873 s 30.9% 1154.1 calls
+Copy H2D  : 0.008219 s  2.4% 137.2 calls  8.0 GB/s
+Copy D2H  : 0.004039 s  1.2% 11.4 calls  8.2 GB/s
+Alloc/Free: 0.008932 s  2.6% 912.1 calls
+GPU: 79.8% Copy H<->D: 3.5% Alloc/Free: 2.5% Comm: 0% CPU & Others: 14%
+I/O:
+
+Timesteps = number of time steps
+Nb solveur = number of linear system resolutions
+Nb assemblage implicite = number of matrix assemblies for the implicit scheme
+Iterations = average number of iterations of the solver
+Communications = fraction of the time spent
+                 in communications between processors (excluding io files)
+Network latency = time of one mpsum measured by an internal bench over 0.1s
+Network bandwidth = maximum on all processors
+                    of the average bandwidth of send_recv operations
+Waiting time = estimation of the waiting time of the different processors
+
+Max_waiting_time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+
+Statistiques de post resolution
+
+Temps total                       0.143859
+
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is247793_gfx1100 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is247793_gfx1100
new file mode 100644
index 0000000000..bd0ae4f1e6
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.is247793_gfx1100
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 17:33:11
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                4.69876        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.491257       
+Average number of iteration of the linear solver per call:                 22             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.50817        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.167574       
+Standard deviation between time steps:                                     0.0378413      
+Time elapsed in the skipped time steps:                                    0.324171       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0474527       | 28.3        | 2              
+Convection operator                      | 0.01240694      |  7.4        | 7              
+Diffusion operator                       | 0.01717201      | 10.2        | 16             
+Gradient operator                        | 0.003410908     |  2.0        | 5              
+Divergence operator                      | 0.001815707     |  1.1        | 6              
+Source terms                             | 0.0005138534    |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.007159344     |  4.3        | 4              
+Solver for implicit diffusion            | 0.005624942     |  3.4        | 4              
+Computation of the time step dt          | 0.00146865      |  0.9        | 6              
+Post-treatment operations                | 0.06273935      | 37.4        | 1              
+Other operations                         | 0.00780992      |  4.7        | 
+
+Average number of iteration of the linear solver per call:                 21.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0472264       | 28.2        | 2               | 
+Kernels:                                 | 0.0947303       | 56.5        | 1116            | 
+Copy host to device:                     | 0.00191656      |  1.1        | 102             | 4.6 GB/s
+Copy device to host:                     | 0.00020206      |  0.1        | 6               | 2.5 GB/s
+Alloc/Free on device:                    | 1.96927e-05     |  0.0        | 856             | 
+GPU: 85% Copy H<->D: 1.3% Alloc/free: 0.012% Comm: 0% CPU & I/O: 14%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0125331      
+
+Total time for the whole computation                                       6.54364        
+
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.jean-zay_cc90 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.jean-zay_cc90
new file mode 100644
index 0000000000..07ab4d2e5d
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.jean-zay_cc90
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     23-04-2026 -- 08:11:50
+OS:       jzxh011__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.74938        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.418271       
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.768038       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0853375      
+Standard deviation between time steps:                                     0.0561452      
+Time elapsed in the skipped time steps:                                    0.663388       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0226408       | 26.5        | 2              
+Convection operator                      | 0.01184246      | 13.9        | 7              
+Diffusion operator                       | 0.009782814     | 11.5        | 16             
+Gradient operator                        | 0.003806978     |  4.5        | 5              
+Divergence operator                      | 0.0009910392    |  1.2        | 6              
+Source terms                             | 0.0001986488    |  0.2        | 4              
+Update ::mettre_a_jour                   | 0.00376011      |  4.4        | 4              
+Solver for implicit diffusion            | 0.002483634     |  2.9        | 4              
+Computation of the time step dt          | 0.0006226492    |  0.7        | 6              
+Post-treatment operations                | 0.02469796      | 28.9        | 1              
+Other operations                         | 0.00451043      |  5.3        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0225083       | 26.4        | 2               | 
+Kernels:                                 | 0.0191684       | 22.5        | 1073            | 
+Copy host to device:                     | 0.00240775      |  2.8        | 107             | 5.5 GB/s
+Copy device to host:                     | 0.00061727      |  0.7        | 6               | 7.8 GB/s
+Alloc/Free on device:                    | 1.9512e-05      |  0.0        | 856             | 
+GPU: 49% Copy H<->D: 3.5% Alloc/free: 0.023% Comm: 0% CPU & I/O: 48%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0715051      
+
+Total time for the whole computation                                       8.25234        
+
+[Slurm] Power consumption (19 s):  0.370 kW  0.002 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.lumi_gfx90a b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.lumi_gfx90a
new file mode 100644
index 0000000000..7b334d89de
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.lumi_gfx90a
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     18-05-2026 -- 08:50:06
+OS:       nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                48.31          
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.69807        
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.09182        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.232424       
+Standard deviation between time steps:                                     0.0682738      
+Time elapsed in the skipped time steps:                                    0.479321       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0269227       | 11.6        | 2              
+Convection operator                      | 0.01438519      |  6.2        | 7              
+Diffusion operator                       | 0.01992115      |  8.6        | 16             
+Gradient operator                        | 0.004397904     |  1.9        | 5              
+Divergence operator                      | 0.001778478     |  0.8        | 6              
+Source terms                             | 0.0003667301    |  0.2        | 4              
+Update ::mettre_a_jour                   | 0.01043215      |  4.5        | 4              
+Solver for implicit diffusion            | 0.004551798     |  2.0        | 4              
+Computation of the time step dt          | 0.001073174     |  0.5        | 6              
+Post-treatment operations                | 0.1427887       | 61.4        | 1              
+Other operations                         | 0.005806166     |  2.5        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0267111       | 11.5        | 2               | 
+Kernels:                                 | 0.163472        | 70.3        | 1116            | 
+Copy host to device:                     | 0.0021821       |  0.9        | 102             | 4.1 GB/s
+Copy device to host:                     | 0.000161978     |  0.1        | 6               | 3.1 GB/s
+Alloc/Free on device:                    | 3.40422e-05     |  0.0        | 856             | 
+GPU: 82% Copy H<->D: 1% Alloc/free: 0.015% Comm: 0% CPU & I/O: 17%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0255515      
+
+Total time for the whole computation                                       50.9067        
+
+[Slurm] Power consumption (76 s):  0.449 kW  0.009 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.topaze_cc80 b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.topaze_cc80
new file mode 100644
index 0000000000..426022d24e
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/GAMELAN_AMG_BENCH.TU.topaze_cc80
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GAMELAN_AMG_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     15-05-2026 -- 13:29:10
+OS:       topaze7064__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : AMD EPYC 7763 64-Core Processor
+Total number of threads:256
+GPU model: NVIDIA A100-SXM4-80GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 600576
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                7.11853        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.439124       
+Average number of iteration of the linear solver per call:                 19             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.928862       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.103207       
+Standard deviation between time steps:                                     0.0773559      
+Time elapsed in the skipped time steps:                                    0.454931       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0303394       | 29.4        | 2              
+Convection operator                      | 0.01164006      | 11.3        | 7              
+Diffusion operator                       | 0.01139027      | 11.0        | 16             
+Gradient operator                        | 0.004048191     |  3.9        | 5              
+Divergence operator                      | 0.001141949     |  1.1        | 6              
+Source terms                             | 0.0002602714    |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.004800796     |  4.7        | 4              
+Solver for implicit diffusion            | 0.003161202     |  3.1        | 4              
+Computation of the time step dt          | 0.0007556401    |  0.7        | 6              
+Post-treatment operations                | 0.0315041       | 30.5        | 1              
+Other operations                         | 0.004165016     |  4.0        | 
+
+Average number of iteration of the linear solver per call:                 22             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0301621       | 29.2        | 2               | 
+Kernels:                                 | 0.025873        | 25.1        | 1116            | 
+Copy host to device:                     | 0.00202125      |  2.0        | 102             | 4.4 GB/s
+Copy device to host:                     | 0.000121609     |  0.1        | 6               | 4.1 GB/s
+Alloc/Free on device:                    | 2.90103e-05     |  0.0        | 856             | 
+GPU: 54% Copy H<->D: 2.1% Alloc/free: 0.028% Comm: 0% CPU & I/O: 44%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.030145       
+
+Total time for the whole computation                                       8.53248        
+
+[Slurm] Power consumption (38 s):  0.629 kW  0.007 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/GAMELAN_AMG/PAR_GAMELAN_AMG_BENCH.TU.is246827x8 b/tests/GPU/GAMELAN_AMG/PAR_GAMELAN_AMG_BENCH.TU.is246827x8
new file mode 100644
index 0000000000..790ebd73cb
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/PAR_GAMELAN_AMG_BENCH.TU.is246827x8
@@ -0,0 +1,63 @@
+Statistiques d'initialisation du calcul
+
+Temps total                       2.30915
+
+Statistiques de resolution du probleme
+
+Temps total                       11.5886
+
+
+Timesteps                         10
+Secondes / pas de temps           1.15886
+Dont solveurs Ax=B                0.652978 56% (2 appels/pas de temps)
+Dont solveur diffusion_implicite  0.060133  5% (4 appels/pas de temps)
+Dont mettre_a_jour                0.057110  4% (4 appels/pas de temps)
+Dont operateurs convection        0.077288  6% (6.9 appels/pas de temps)
+Dont operateurs diffusion         0.202981 17% (16 appels/pas de temps)
+Dont operateurs gradient          0.019023  1% (5 appels/pas de temps)
+Dont operateurs divergence        0.018322  1% (6 appels/pas de temps)
+Dont operateurs source            0.003755  0% (4 appels/pas de temps)
+Dont operations postraitement     0.010286  0% (1 appel/pas de temps)
+Dont calcul dt                    0.009854  0% (6 appels/pas de temps)
+Dont calcul divers                0.085039  7% (0 appels/pas de temps)
+Nb echange_espace_virtuel / pas de temps 105.9
+Nb MPI_allreduce / pas de temps 54.2
+-----------------------------------------------------------------------------------------------------------------------------------------
+Warning: The number of MPI_allreduce calls per time step is high. Contact TRUST support if you plan to run massive parallel calculation.
+-----------------------------------------------------------------------------------------------------------------------------------------
+Nb solveur / pas de temps         2
+Secondes / solveur                0.326489
+Iterations / solveur              8.05
+I/O:
+---------------------------------------------------------------------------------------------------------
+Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
+---------------------------------------------------------------------------------------------------------
+Communications avg        4.7 % of total time
+Communications max        8.4 % of total time
+Communications min        2.1 % of total time
+Network latency benchmark 1.39952e-06 s
+Network bandwidth max     4325.19 MB/s
+Total network traffic     461.511 MB / timestep
+Average message size      151.893 kB
+Min waiting time          0.2 % of total time
+Max waiting time          6.5 % of total time
+Avg waiting time          2.9 % of total time
+
+Timesteps = number of time steps
+Nb solveur = number of linear system resolutions
+Nb assemblage implicite = number of matrix assemblies for the implicit scheme
+Iterations = average number of iterations of the solver
+Communications = fraction of the time spent
+                 in communications between processors (excluding io files)
+Network latency = time of one mpsum measured by an internal bench over 0.1s
+Network bandwidth = maximum on all processors
+                    of the average bandwidth of send_recv operations
+Waiting time = estimation of the waiting time of the different processors
+
+Max_waiting_time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+
+Statistiques de post resolution
+
+Temps total                       0.0601
+
diff --git a/tests/GPU/GAMELAN_AMG/check_perf.sh b/tests/GPU/GAMELAN_AMG/check_perf.sh
new file mode 120000
index 0000000000..6d20411c12
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/check_perf.sh
@@ -0,0 +1 @@
+../DomainFlowLES/check_perf.sh
\ No newline at end of file
diff --git a/tests/GPU/GAMELAN_AMG/dom.geom b/tests/GPU/GAMELAN_AMG/dom.geom
new file mode 120000
index 0000000000..0759265946
--- /dev/null
+++ b/tests/GPU/GAMELAN_AMG/dom.geom
@@ -0,0 +1 @@
+../../Dilatable/GAMELAN/dom.geom
\ No newline at end of file
diff --git a/tests/GPU/GMRES/GMRES.data b/tests/GPU/GMRES/GMRES.data
index 895e8fcd83..c27c485015 100644
--- a/tests/GPU/GMRES/GMRES.data
+++ b/tests/GPU/GMRES/GMRES.data
@@ -41,7 +41,8 @@ Scatter DOM.Zones dom_fluide
 END SCATTER #
 
 
-VEFPreP1B dis
+VEFPreP1B dis 
+Lire dis { reorder { algo Hilbert } }
 
 Scheme_euler_implicit sch
 Read sch
diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx90a b/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx90a
index bd1cb44dd5..0d0c3bf521 100644
--- a/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx90a
+++ b/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     12-03-2026 -- 18:25:55
-OS:       g1016__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 09:21:16
+OS:       g1109__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                54.233         
+Total time of the start-up:                                                58.4859        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             2.47337        
+Average time of the resolution of the linear problem per call:             3.10443        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               19.9272        
+Total time of the time loop:                                               18.9179        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                2.21413        
-Standard deviation between time steps:                                     0.279079       
-Time elapsed in the skipped time steps:                                    8.57934        
+Average time per time step:                                                2.10199        
+Standard deviation between time steps:                                     0.290698       
+Time elapsed in the skipped time steps:                                    18.7335        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 1.39803         | 63.1        | 3              
-Matrix assembly for implicit scheme      | 0.1719198       |  7.8        | 1              
-Convection operator                      | 0.2154025       |  9.7        | 4              
-Diffusion operator                       | 0.01442309      |  0.7        | 2              
-Divergence operator                      | 0.03172028      |  1.4        | 4              
-Source terms                             | 0.0005456014    |  0.0        | 2              
-Update ::mettre_a_jour                   | 0.01157037      |  0.5        | 4              
-Computation of the time step dt          | 0.001521904     |  0.1        | 4              
-Post-treatment operations                | 0.02268971      |  1.0        | 1              
-Other operations                         | 0.3463029       | 15.6        | 
+Linear solver resolutions Ax=B           | 1.4093          | 67.0        | 3              
+Matrix assembly for implicit scheme      | 0.1359102       |  6.5        | 1              
+Convection operator                      | 0.1733518       |  8.2        | 4              
+Diffusion operator                       | 0.01203251      |  0.6        | 2              
+Divergence operator                      | 0.02118798      |  1.0        | 4              
+Source terms                             | 0.0005388649    |  0.0        | 2              
+Update ::mettre_a_jour                   | 0.00881933      |  0.4        | 4              
+Computation of the time step dt          | 0.00153714      |  0.1        | 4              
+Post-treatment operations                | 0.02121914      |  1.0        | 1              
+Other operations                         | 0.3180953       | 15.1        | 
 
 Average number of iteration of the linear solver per call:                 14.7           
 
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 14.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0841743       |  3.8        | 1               | 
-Kernels:                                 | 2.1175          | 95.6        | 1301            | 
-Copy host to device:                     | 0.000659637     |  0.0        | 21              | 6.8 GB/s
-Copy device to host:                     | 0.00075392      |  0.0        | 7               | 14.8 GB/s
-Alloc/Free on device:                    | 0.000119878     |  0.0        | 4               | 
-GPU: 99% Copy H<->D: 0.064% Alloc/free: 0.0054% Comm: 0% CPU & I/O: 0.49%
+Libraries:                               | 0.0665175       |  3.2        | 1               | 
+Kernels:                                 | 2.02331         | 96.3        | 1300            | 
+Copy host to device:                     | 0.000635208     |  0.0        | 21              | 7.1 GB/s
+Copy device to host:                     | 0.000746803     |  0.0        | 7               | 14.9 GB/s
+Alloc/Free on device:                    | 0.00011886      |  0.0        | 4               | 
+GPU: 99% Copy H<->D: 0.066% Alloc/free: 0.0057% Comm: 0% CPU & I/O: 0.51%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.197574       
+Time of the post-resolution:                                               0.207713       
 
-Total time for the whole computation                                       82.9371        
+Total time for the whole computation                                       96.3451        
 
-[Slurm] Power consumption (90 s):  0.535 kW  0.013 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (104 s):  0.529 kW  0.015 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx942 b/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx942
index 3de23b884e..a2f22ab2f1 100644
--- a/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx942
+++ b/tests/GPU/GMRES/GMRES_BENCH.TU.adastra_gfx942
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     23-02-2026 -- 17:59:06
-OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 14:41:34
+OS:       a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD Instinct MI300A Accelerator
 Total number of threads:192
 GPU model: AMD Instinct MI300A
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                89.1523        
+Total time of the start-up:                                                63.9456        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             3.94747        
+Average time of the resolution of the linear problem per call:             3.87063        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               7.1811         
+Total time of the time loop:                                               6.96203        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.7979         
-Standard deviation between time steps:                                     0.123846       
-Time elapsed in the skipped time steps:                                    9.4072         
+Average time per time step:                                                0.773558       
+Standard deviation between time steps:                                     0.122661       
+Time elapsed in the skipped time steps:                                    17.8522        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.6164          | 77.3        | 3              
-Matrix assembly for implicit scheme      | 0.03673611      |  4.6        | 1              
-Convection operator                      | 0.04092509      |  5.1        | 4              
-Diffusion operator                       | 0.004677966     |  0.6        | 2              
-Divergence operator                      | 0.009021316     |  1.1        | 4              
-Source terms                             | 0.0003811747    |  0.0        | 2              
-Update ::mettre_a_jour                   | 0.005522843     |  0.7        | 4              
-Computation of the time step dt          | 0.001337907     |  0.2        | 4              
-Post-treatment operations                | 0.01127625      |  1.4        | 1              
-Other operations                         | 0.07162194      |  9.0        | 
+Linear solver resolutions Ax=B           | 0.606553        | 78.4        | 3              
+Matrix assembly for implicit scheme      | 0.03394749      |  4.4        | 1              
+Convection operator                      | 0.04057731      |  5.2        | 4              
+Diffusion operator                       | 0.00410471      |  0.5        | 2              
+Divergence operator                      | 0.006404666     |  0.8        | 4              
+Source terms                             | 0.0002954579    |  0.0        | 2              
+Update ::mettre_a_jour                   | 0.004831261     |  0.6        | 4              
+Computation of the time step dt          | 0.001217283     |  0.2        | 4              
+Post-treatment operations                | 0.01085299      |  1.4        | 1              
+Other operations                         | 0.06477427      |  8.4        | 
 
 Average number of iteration of the linear solver per call:                 14.7           
 
@@ -61,16 +61,17 @@ Average number of iteration of the linear solver per call:                 14.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0385048       |  4.8        | 1               | 
-Kernels:                                 | 0.747298        | 93.7        | 1301            | 
-Copy host to device:                     | 0.000577279     |  0.1        | 21              | 7.8 GB/s
-Copy device to host:                     | 0.00051411      |  0.1        | 7               | 21.7 GB/s
-Alloc/Free on device:                    | 0.000865148     |  0.1        | 4               | 
-GPU: 98% Copy H<->D: 0.14% Alloc/free: 0.11% Comm: 0% CPU & I/O: 1.3%
+Libraries:                               | 0.0347636       |  4.5        | 1               | 
+Kernels:                                 | 0.726319        | 93.9        | 1300            | 
+Copy host to device:                     | 0.000620905     |  0.1        | 21              | 7.2 GB/s
+Copy device to host:                     | 0.000508449     |  0.1        | 7               | 21.9 GB/s
+Alloc/Free on device:                    | 0.000853162     |  0.1        | 4               | 
+GPU: 98% Copy H<->D: 0.15% Alloc/free: 0.11% Comm: 0% CPU & I/O: 1.4%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.214741       
+Time of the post-resolution:                                               0.183286       
 
-Total time for the whole computation                                       105.955        
+Total time for the whole computation                                       88.9431        
 
+[Slurm] Power consumption (98 s):  0.703 kW  0.019 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.dalianvl_cc100 b/tests/GPU/GMRES/GMRES_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..0dfe5397eb
--- /dev/null
+++ b/tests/GPU/GMRES/GMRES_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GMRES_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:15:07
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                39.6331        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             2.71223        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.59626        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.177362       
+Standard deviation between time steps:                                     0.0223964      
+Time elapsed in the skipped time steps:                                    9.6339         
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.11388         | 64.2        | 3              
+Matrix assembly for implicit scheme      | 0.01229249      |  6.9        | 1              
+Convection operator                      | 0.01123903      |  6.3        | 4              
+Diffusion operator                       | 0.001825577     |  1.0        | 2              
+Divergence operator                      | 0.001687251     |  1.0        | 4              
+Source terms                             | 0.0001569918    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.002649643     |  1.5        | 4              
+Computation of the time step dt          | 0.0005573932    |  0.3        | 4              
+Post-treatment operations                | 0.005802905     |  3.3        | 1              
+Other operations                         | 0.02727123      | 15.4        | 
+
+Average number of iteration of the linear solver per call:                 14.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0326826       | 18.4        | 1               | 
+Kernels:                                 | 0.129523        | 73.0        | 1300            | 
+Copy host to device:                     | 0.000431132     |  0.2        | 21              | 10.4 GB/s
+Copy device to host:                     | 0.00143199      |  0.8        | 7               | 7.8 GB/s
+Alloc/Free on device:                    | 0.00336589      |  1.9        | 4               | 
+GPU: 91% Copy H<->D: 1.1% Alloc/free: 1.9% Comm: 0% CPU & I/O: 5.6%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.154901       
+
+Total time for the whole computation                                       51.0182        
+
diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.eureka_cc89 b/tests/GPU/GMRES/GMRES_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..99b4f42fb5
--- /dev/null
+++ b/tests/GPU/GMRES/GMRES_BENCH.TU.eureka_cc89
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GMRES_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:31:26
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                50.7317        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             3.94905        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               6.3836         
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.709288       
+Standard deviation between time steps:                                     0.0878574      
+Time elapsed in the skipped time steps:                                    21.172         
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.483116        | 68.1        | 3              
+Matrix assembly for implicit scheme      | 0.03525991      |  5.0        | 1              
+Convection operator                      | 0.03701098      |  5.2        | 4              
+Diffusion operator                       | 0.003952027     |  0.6        | 2              
+Divergence operator                      | 0.004919432     |  0.7        | 4              
+Source terms                             | 0.0006515458    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.004295798     |  0.6        | 4              
+Computation of the time step dt          | 0.001351149     |  0.2        | 4              
+Post-treatment operations                | 0.008900335     |  1.3        | 1              
+Other operations                         | 0.129831        | 18.3        | 
+
+Average number of iteration of the linear solver per call:                 14.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.065369        |  9.2        | 1               | 
+Kernels:                                 | 0.631272        | 89.0        | 1300            | 
+Copy host to device:                     | 0.00114264      |  0.2        | 21              | 3.9 GB/s
+Copy device to host:                     | 0.00259112      |  0.4        | 7               | 4.3 GB/s
+Alloc/Free on device:                    | 0.000653251     |  0.1        | 4               | 
+GPU: 98% Copy H<->D: 0.53% Alloc/free: 0.092% Comm: 0% CPU & I/O: 1.2%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.102475       
+
+Total time for the whole computation                                       78.3897        
+
diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.is157091_cc86 b/tests/GPU/GMRES/GMRES_BENCH.TU.is157091_cc86
index d711f92cd4..aa08c81b5d 100644
--- a/tests/GPU/GMRES/GMRES_BENCH.TU.is157091_cc86
+++ b/tests/GPU/GMRES/GMRES_BENCH.TU.is157091_cc86
@@ -8,7 +8,7 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 17:55:11
+Date:     22-04-2026 -- 20:42:37
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                43.6421        
+Total time of the start-up:                                                44.9979        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             2.76651        
+Average time of the resolution of the linear problem per call:             3.21012        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               12.4133        
+Total time of the time loop:                                               11.2014        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.37926        
-Standard deviation between time steps:                                     0.145441       
-Time elapsed in the skipped time steps:                                    12.0402        
+Average time per time step:                                                1.2446         
+Standard deviation between time steps:                                     0.142212       
+Time elapsed in the skipped time steps:                                    18.2877        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.839934        | 60.9        | 3              
-Matrix assembly for implicit scheme      | 0.100131        |  7.3        | 1              
-Convection operator                      | 0.0936801       |  6.8        | 4              
-Diffusion operator                       | 0.00734367      |  0.5        | 2              
-Divergence operator                      | 0.02655093      |  1.9        | 4              
-Source terms                             | 0.001257321     |  0.1        | 2              
-Update ::mettre_a_jour                   | 0.01117306      |  0.8        | 4              
-Computation of the time step dt          | 0.002250517     |  0.2        | 4              
-Post-treatment operations                | 0.01673185      |  1.2        | 1              
-Other operations                         | 0.2802051       | 20.3        | 
+Linear solver resolutions Ax=B           | 0.800799        | 64.3        | 3              
+Matrix assembly for implicit scheme      | 0.07217464      |  5.8        | 1              
+Convection operator                      | 0.0669646       |  5.4        | 4              
+Diffusion operator                       | 0.005771725     |  0.5        | 2              
+Divergence operator                      | 0.01634721      |  1.3        | 4              
+Source terms                             | 0.0007613063    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.008655555     |  0.7        | 4              
+Computation of the time step dt          | 0.002120476     |  0.2        | 4              
+Post-treatment operations                | 0.01154682      |  0.9        | 1              
+Other operations                         | 0.2594554       | 20.8        | 
 
 Average number of iteration of the linear solver per call:                 14.7           
 
@@ -61,16 +61,16 @@ Average number of iteration of the linear solver per call:                 14.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.123609        |  9.0        | 1               | 
-Kernels:                                 | 1.24552         | 90.3        | 1301            | 
-Copy host to device:                     | 0.00138436      |  0.1        | 21              | 3.2 GB/s
-Copy device to host:                     | 0.00108039      |  0.1        | 7               | 10.3 GB/s
-Alloc/Free on device:                    | 0.00046704      |  0.0        | 4               | 
-GPU: 99% Copy H<->D: 0.18% Alloc/free: 0.034% Comm: 0% CPU & I/O: 0.52%
+Libraries:                               | 0.105849        |  8.5        | 1               | 
+Kernels:                                 | 1.12879         | 90.7        | 1300            | 
+Copy host to device:                     | 0.00137746      |  0.1        | 21              | 3.3 GB/s
+Copy device to host:                     | 0.00110908      |  0.1        | 7               | 10.0 GB/s
+Alloc/Free on device:                    | 0.000533048     |  0.0        | 4               | 
+GPU: 99% Copy H<->D: 0.2% Alloc/free: 0.043% Comm: 0% CPU & I/O: 0.56%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0728189      
+Time of the post-resolution:                                               0.070975       
 
-Total time for the whole computation                                       68.1685        
+Total time for the whole computation                                       74.5579        
 
diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.is159479_cc120 b/tests/GPU/GMRES/GMRES_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..834dadd02b
--- /dev/null
+++ b/tests/GPU/GMRES/GMRES_BENCH.TU.is159479_cc120
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GMRES_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:34:27
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                33.6781        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             1.96677        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               3.25831        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.362034       
+Standard deviation between time steps:                                     0.0415193      
+Time elapsed in the skipped time steps:                                    12.0571        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.239558        | 66.2        | 3              
+Matrix assembly for implicit scheme      | 0.02138728      |  5.9        | 1              
+Convection operator                      | 0.02537361      |  7.0        | 4              
+Diffusion operator                       | 0.002365576     |  0.7        | 2              
+Divergence operator                      | 0.002552803     |  0.7        | 4              
+Source terms                             | 0.0003383349    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.002266834     |  0.6        | 4              
+Computation of the time step dt          | 0.0007245106    |  0.2        | 4              
+Post-treatment operations                | 0.005288873     |  1.5        | 1              
+Other operations                         | 0.06217785      | 17.2        | 
+
+Average number of iteration of the linear solver per call:                 14.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.043691        | 12.1        | 1               | 
+Kernels:                                 | 0.31172         | 86.1        | 1300            | 
+Copy host to device:                     | 0.000469634     |  0.1        | 21              | 9.5 GB/s
+Copy device to host:                     | 0.00146406      |  0.4        | 7               | 7.6 GB/s
+Alloc/Free on device:                    | 0.000357557     |  0.1        | 4               | 
+GPU: 98% Copy H<->D: 0.53% Alloc/free: 0.099% Comm: 0% CPU & I/O: 1.2%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0655445      
+
+Total time for the whole computation                                       49.0591        
+
diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.is247793_gfx1100 b/tests/GPU/GMRES/GMRES_BENCH.TU.is247793_gfx1100
new file mode 100644
index 0000000000..051d8fa10d
--- /dev/null
+++ b/tests/GPU/GMRES/GMRES_BENCH.TU.is247793_gfx1100
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GMRES_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 17:33:51
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                38.0716        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             3.84061        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               13.4029        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                1.48921        
+Standard deviation between time steps:                                     0.222303       
+Time elapsed in the skipped time steps:                                    15.3733        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 1.14468         | 76.9        | 3              
+Matrix assembly for implicit scheme      | 0.06534276      |  4.4        | 1              
+Convection operator                      | 0.07545809      |  5.1        | 4              
+Diffusion operator                       | 0.008406638     |  0.6        | 2              
+Divergence operator                      | 0.01259237      |  0.8        | 4              
+Source terms                             | 0.0009561102    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.007802457     |  0.5        | 4              
+Computation of the time step dt          | 0.002251303     |  0.2        | 4              
+Post-treatment operations                | 0.01362194      |  0.9        | 1              
+Other operations                         | 0.1580985       | 10.6        | 
+
+Average number of iteration of the linear solver per call:                 13.1           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.146646        |  9.8        | 1               | 
+Kernels:                                 | 1.33413         | 89.6        | 1300            | 
+Copy host to device:                     | 0.00109438      |  0.1        | 21              | 4.1 GB/s
+Copy device to host:                     | 0.000674911     |  0.0        | 7               | 16.5 GB/s
+Alloc/Free on device:                    | 0.000762378     |  0.1        | 4               | 
+GPU: 99% Copy H<->D: 0.12% Alloc/free: 0.051% Comm: 0% CPU & I/O: 0.4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.114679       
+
+Total time for the whole computation                                       66.9625        
+
diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.lumi_gfx90a b/tests/GPU/GMRES/GMRES_BENCH.TU.lumi_gfx90a
new file mode 100644
index 0000000000..96e5d1d18e
--- /dev/null
+++ b/tests/GPU/GMRES/GMRES_BENCH.TU.lumi_gfx90a
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GMRES_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     18-05-2026 -- 08:53:57
+OS:       nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                104.002        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             7.31327        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               17.8793        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                1.98659        
+Standard deviation between time steps:                                     0.2676         
+Time elapsed in the skipped time steps:                                    24.2401        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 1.31941         | 66.4        | 3              
+Matrix assembly for implicit scheme      | 0.1361106       |  6.9        | 1              
+Convection operator                      | 0.1735005       |  8.7        | 4              
+Diffusion operator                       | 0.01117017      |  0.6        | 2              
+Divergence operator                      | 0.01994576      |  1.0        | 4              
+Source terms                             | 0.0005305762    |  0.0        | 2              
+Update ::mettre_a_jour                   | 0.008478555     |  0.4        | 4              
+Computation of the time step dt          | 0.001511032     |  0.1        | 4              
+Post-treatment operations                | 0.01971884      |  1.0        | 1              
+Other operations                         | 0.2962186       | 14.9        | 
+
+Average number of iteration of the linear solver per call:                 14.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0650842       |  3.3        | 1               | 
+Kernels:                                 | 1.90925         | 96.1        | 1300            | 
+Copy host to device:                     | 0.000662391     |  0.0        | 21              | 6.8 GB/s
+Copy device to host:                     | 0.000750408     |  0.0        | 7               | 14.8 GB/s
+Alloc/Free on device:                    | 0.000125511     |  0.0        | 4               | 
+GPU: 99% Copy H<->D: 0.071% Alloc/free: 0.0063% Comm: 0% CPU & I/O: 0.54%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.204038       
+
+Total time for the whole computation                                       146.325        
+
+[Slurm] Power consumption (170 s):  0.518 kW  0.024 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/GMRES/GMRES_BENCH.TU.topaze_cc80 b/tests/GPU/GMRES/GMRES_BENCH.TU.topaze_cc80
index ca10fe9c5e..f22144e384 100644
--- a/tests/GPU/GMRES/GMRES_BENCH.TU.topaze_cc80
+++ b/tests/GPU/GMRES/GMRES_BENCH.TU.topaze_cc80
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 15:24:53
-OS:       topaze7046__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     15-05-2026 -- 13:31:12
+OS:       topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                57.25          
+Total time of the start-up:                                                61.3805        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             4.33425        
+Average time of the resolution of the linear problem per call:             4.66789        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               5.0447         
+Total time of the time loop:                                               4.57182        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.560522       
-Standard deviation between time steps:                                     0.054987       
-Time elapsed in the skipped time steps:                                    16.1386        
+Average time per time step:                                                0.50798        
+Standard deviation between time steps:                                     0.0546923      
+Time elapsed in the skipped time steps:                                    23.7777        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.333833        | 59.6        | 3              
-Matrix assembly for implicit scheme      | 0.04433419      |  7.9        | 1              
-Convection operator                      | 0.04280126      |  7.6        | 4              
-Diffusion operator                       | 0.005348669     |  1.0        | 2              
-Divergence operator                      | 0.008769969     |  1.6        | 4              
-Source terms                             | 0.001239124     |  0.2        | 2              
-Update ::mettre_a_jour                   | 0.004984535     |  0.9        | 4              
-Computation of the time step dt          | 0.001195886     |  0.2        | 4              
-Post-treatment operations                | 0.01160467      |  2.1        | 1              
-Other operations                         | 0.10641         | 19.0        | 
+Linear solver resolutions Ax=B           | 0.327822        | 64.5        | 3              
+Matrix assembly for implicit scheme      | 0.02985497      |  5.9        | 1              
+Convection operator                      | 0.02795193      |  5.5        | 4              
+Diffusion operator                       | 0.004184844     |  0.8        | 2              
+Divergence operator                      | 0.00393144      |  0.8        | 4              
+Source terms                             | 0.0004237254    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.003992837     |  0.8        | 4              
+Computation of the time step dt          | 0.001005162     |  0.2        | 4              
+Post-treatment operations                | 0.00991028      |  2.0        | 1              
+Other operations                         | 0.09890277      | 19.5        | 
 
 Average number of iteration of the linear solver per call:                 14.7           
 
@@ -61,16 +61,17 @@ Average number of iteration of the linear solver per call:                 14.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0617394       | 11.0        | 1               | 
-Kernels:                                 | 0.483261        | 86.2        | 1301            | 
-Copy host to device:                     | 0.00155029      |  0.3        | 21              | 2.9 GB/s
-Copy device to host:                     | 0.0014181       |  0.3        | 7               | 7.9 GB/s
-Alloc/Free on device:                    | 0.000880882     |  0.2        | 4               | 
-GPU: 97% Copy H<->D: 0.53% Alloc/free: 0.16% Comm: 0% CPU & I/O: 2.1%
+Libraries:                               | 0.0595002       | 11.7        | 1               | 
+Kernels:                                 | 0.433552        | 85.3        | 1300            | 
+Copy host to device:                     | 0.00174372      |  0.3        | 21              | 2.6 GB/s
+Copy device to host:                     | 0.00094954      |  0.2        | 7               | 11.7 GB/s
+Alloc/Free on device:                    | 0.00095336      |  0.2        | 4               | 
+GPU: 97% Copy H<->D: 0.53% Alloc/free: 0.19% Comm: 0% CPU & I/O: 2.2%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.235477       
+Time of the post-resolution:                                               0.213536       
 
-Total time for the whole computation                                       78.6688        
+Total time for the whole computation                                       89.9436        
 
+[Slurm] Power consumption (129 s):  0.454 kW  0.016 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/GPU4/GPU4.data b/tests/GPU/GPU4/GPU4.data
index 922b0221ab..c0a8e9fee7 100644
--- a/tests/GPU/GPU4/GPU4.data
+++ b/tests/GPU/GPU4/GPU4.data
@@ -40,7 +40,8 @@ END PARTITION #
 Scatter DOM.Zones dom
 END SCATTER #
 
-VEFPreP1B dis
+VEFPreP1B dis 
+Lire dis { reorder { algo Hilbert } }
 
 Scheme_euler_explicit sch
 Read sch
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx90a b/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx90a
index 0e42504b80..bcabd77109 100644
--- a/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx90a
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 21:11:25
-OS:       g1031__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     24-05-2026 -- 15:59:02
+OS:       g1321__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                41.562         
+Total time of the start-up:                                                32.7813        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.31268        
+Average time of the resolution of the linear problem per call:             1.65578        
 Average number of iteration of the linear solver per call:                 33             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.49597        
+Total time of the time loop:                                               1.05563        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.166219       
-Standard deviation between time steps:                                     0.0078387      
-Time elapsed in the skipped time steps:                                    0.142076       
+Average time per time step:                                                0.117292       
+Standard deviation between time steps:                                     0.00824722     
+Time elapsed in the skipped time steps:                                    0.101644       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0452288       | 24.9        | 1              
-Convection operator                      | 0.02206269      | 12.1        | 1              
-Diffusion operator                       | 0.008556311     |  4.7        | 1              
-Gradient operator                        | 0.04916169      | 27.0        | 2              
-Divergence operator                      | 0.01618532      |  8.9        | 2              
-Update ::mettre_a_jour                   | 0.01322576      |  7.3        | 1              
-Computation of the time step dt          | 0.006794905     |  3.7        | 2              
-Post-treatment operations                | 0.00346718      |  1.9        | 1              
-Other operations                         | 0.001535927     |  0.8        | 
+Linear solver resolutions Ax=B           | 0.0370046       | 31.5        | 1              
+Convection operator                      | 0.01428056      | 12.2        | 1              
+Diffusion operator                       | 0.008201341     |  7.0        | 1              
+Gradient operator                        | 0.02456975      | 20.9        | 2              
+Divergence operator                      | 0.01065395      |  9.1        | 2              
+Update ::mettre_a_jour                   | 0.009077858     |  7.7        | 1              
+Computation of the time step dt          | 0.005003812     |  4.3        | 2              
+Post-treatment operations                | 0.003668977     |  3.1        | 1              
+Other operations                         | 0.00483125      |  4.1        | 
 
-Average number of iteration of the linear solver per call:                 20.3           
+Average number of iteration of the linear solver per call:                 20.7           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 20.3
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0450126       | 27.1        | 1               | 
-Kernels:                                 | 0.115366        | 69.4        | 147             | 
-Copy host to device:                     | 0.00027125      |  0.2        | 10              | 4.9 GB/s
-Copy device to host:                     | 0.000193674     |  0.1        | 1               | 17.3 GB/s
-Alloc/Free on device:                    | 1.50704e-05     |  0.0        | 0               | 
-GPU: 96% Copy H<->D: 0.28% Alloc/free: 0.0091% Comm: 0% CPU & I/O: 3.2%
+Libraries:                               | 0.0367933       | 31.4        | 1               | 
+Kernels:                                 | 0.0751332       | 64.1        | 153             | 
+Copy host to device:                     | 0.000225522     |  0.2        | 9               | 5.1 GB/s
+Copy device to host:                     | 0.000229117     |  0.2        | 2               | 15.5 GB/s
+Alloc/Free on device:                    | 1.44047e-05     |  0.0        | 0               | 
+GPU: 95% Copy H<->D: 0.39% Alloc/free: 0.012% Comm: 0% CPU & I/O: 4.2%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.157813       
+Time of the post-resolution:                                               0.15931        
 
-Total time for the whole computation                                       43.3579        
+Total time for the whole computation                                       34.0979        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (40 s):  0.438 kW  0.005 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx942 b/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx942
index 6b13b9d45c..3ecbd97739 100644
--- a/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx942
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.adastra_gfx942
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     17-03-2026 -- 17:53:25
-OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     08-06-2026 -- 14:41:00
+OS:       a1002__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD Instinct MI300A Accelerator
 Total number of threads:192
 GPU model: AMD Instinct MI300A
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                79.453         
+Total time of the start-up:                                                26.4833        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.8534         
+Average time of the resolution of the linear problem per call:             1.3744         
 Average number of iteration of the linear solver per call:                 33             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.493985       
+Total time of the time loop:                                               0.455643       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0548872      
-Standard deviation between time steps:                                     0.00732123     
-Time elapsed in the skipped time steps:                                    0.075823       
+Average time per time step:                                                0.050627       
+Standard deviation between time steps:                                     0.00715344     
+Time elapsed in the skipped time steps:                                    0.0646453      
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0205601       | 37.5        | 1              
-Convection operator                      | 0.005483088     | 10.0        | 1              
-Diffusion operator                       | 0.002202733     |  4.0        | 1              
-Gradient operator                        | 0.01036635      | 18.9        | 2              
-Divergence operator                      | 0.004477627     |  8.2        | 2              
-Update ::mettre_a_jour                   | 0.003807159     |  6.9        | 1              
-Computation of the time step dt          | 0.001930352     |  3.5        | 2              
-Post-treatment operations                | 0.003031623     |  5.5        | 1              
-Other operations                         | 0.003028226     |  5.5        | 
+Linear solver resolutions Ax=B           | 0.019712        | 38.9        | 1              
+Convection operator                      | 0.00476502      |  9.4        | 1              
+Diffusion operator                       | 0.002064178     |  4.1        | 1              
+Gradient operator                        | 0.01031318      | 20.4        | 2              
+Divergence operator                      | 0.002667278     |  5.3        | 2              
+Update ::mettre_a_jour                   | 0.002899753     |  5.7        | 1              
+Computation of the time step dt          | 0.001444263     |  2.9        | 2              
+Post-treatment operations                | 0.002739872     |  5.4        | 1              
+Other operations                         | 0.004021502     |  7.9        | 
 
-Average number of iteration of the linear solver per call:                 20.3           
+Average number of iteration of the linear solver per call:                 20.7           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 20.3
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0204319       | 37.2        | 1               | 
-Kernels:                                 | 0.0293497       | 53.5        | 147             | 
-Copy host to device:                     | 0.000250773     |  0.5        | 10              | 5.3 GB/s
-Copy device to host:                     | 0.000111261     |  0.2        | 1               | 30.2 GB/s
-Alloc/Free on device:                    | 0.000108043     |  0.2        | 0               | 
-GPU: 91% Copy H<->D: 0.66% Alloc/free: 0.2% Comm: 0% CPU & I/O: 8.4%
+Libraries:                               | 0.0196001       | 38.7        | 1               | 
+Kernels:                                 | 0.0265284       | 52.4        | 153             | 
+Copy host to device:                     | 0.000192319     |  0.4        | 9               | 5.9 GB/s
+Copy device to host:                     | 0.000149286     |  0.3        | 2               | 23.8 GB/s
+Alloc/Free on device:                    | 0.00011148      |  0.2        | 0               | 
+GPU: 91% Copy H<->D: 0.67% Alloc/free: 0.22% Comm: 0% CPU & I/O: 8%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.147033       
+Time of the post-resolution:                                               0.142964       
 
-Total time for the whole computation                                       80.1698        
+Total time for the whole computation                                       27.1465        
 
-[Slurm] Power consumption (95 s):  0.624 kW  0.016 kWh  0.002 € (0.10€/kWh)
+[Slurm] Power consumption (34 s):  0.629 kW  0.006 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.dalianvl_cc100 b/tests/GPU/GPU4/GPU4_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..f000fce39f
--- /dev/null
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GPU4_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:15:53
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2592000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                30.6801        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.40983        
+Average number of iteration of the linear solver per call:                 33             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.298542       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0331714      
+Standard deviation between time steps:                                     0.00423861     
+Time elapsed in the skipped time steps:                                    0.0267229      
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0175178       | 52.8        | 1              
+Convection operator                      | 0.002603557     |  7.8        | 1              
+Diffusion operator                       | 0.0007492484    |  2.3        | 1              
+Gradient operator                        | 0.002730113     |  8.2        | 2              
+Divergence operator                      | 0.0008824289    |  2.7        | 2              
+Update ::mettre_a_jour                   | 0.00135279      |  4.1        | 1              
+Computation of the time step dt          | 0.0005503397    |  1.7        | 2              
+Post-treatment operations                | 0.001907084     |  5.7        | 1              
+Other operations                         | 0.004878026     | 14.7        | 
+
+Average number of iteration of the linear solver per call:                 20.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0173774       | 52.4        | 1               | 
+Kernels:                                 | 0.0106231       | 32.0        | 153             | 
+Copy host to device:                     | 0.000173476     |  0.5        | 9               | 6.6 GB/s
+Copy device to host:                     | 7.98649e-05     |  0.2        | 2               | 44.5 GB/s
+Alloc/Free on device:                    | 5.66116e-05     |  0.2        | 0               | 
+GPU: 84% Copy H<->D: 0.76% Alloc/free: 0.17% Comm: 0% CPU & I/O: 15%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.124356       
+
+Total time for the whole computation                                       31.1298        
+
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.eureka_cc89 b/tests/GPU/GPU4/GPU4_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..c4c8d5692a
--- /dev/null
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.eureka_cc89
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GPU4_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:32:28
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2592000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                32.9864        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.77157        
+Average number of iteration of the linear solver per call:                 33             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.649571       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0721746      
+Standard deviation between time steps:                                     0.00304736     
+Time elapsed in the skipped time steps:                                    0.0509077      
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0351763       | 48.7        | 1              
+Convection operator                      | 0.008166033     | 11.3        | 1              
+Diffusion operator                       | 0.00277643      |  3.8        | 1              
+Gradient operator                        | 0.007993661     | 11.1        | 2              
+Divergence operator                      | 0.003646946     |  5.1        | 2              
+Update ::mettre_a_jour                   | 0.004365205     |  6.0        | 1              
+Computation of the time step dt          | 0.001764848     |  2.4        | 2              
+Post-treatment operations                | 0.001672337     |  2.3        | 1              
+Other operations                         | 0.006612768     |  9.2        | 
+
+Average number of iteration of the linear solver per call:                 20.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0349861       | 48.5        | 1               | 
+Kernels:                                 | 0.0334288       | 46.3        | 153             | 
+Copy host to device:                     | 0.000193372     |  0.3        | 9               | 5.9 GB/s
+Copy device to host:                     | 0.000633207     |  0.9        | 2               | 5.6 GB/s
+Alloc/Free on device:                    | 2.62861e-05     |  0.0        | 0               | 
+GPU: 95% Copy H<->D: 1.1% Alloc/free: 0.036% Comm: 0% CPU & I/O: 4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0631071      
+
+Total time for the whole computation                                       33.75          
+
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/GPU4/GPU4_BENCH.TU.irene-amd-ccrt_cc70
index 2434000bde..d7c9ed336b 100644
--- a/tests/GPU/GPU4/GPU4_BENCH.TU.irene-amd-ccrt_cc70
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.irene-amd-ccrt_cc70
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 14:15:18
-OS:       irene7056__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     23-04-2026 -- 14:51:45
+OS:       irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
 Total number of threads:80
 GPU model: Tesla V100-SXM2-16GB
@@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                50.9891        
+Total time of the start-up:                                                52.3282        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             2.63023        
+Average time of the resolution of the linear problem per call:             2.75565        
 Average number of iteration of the linear solver per call:                 33             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.32894        
+Total time of the time loop:                                               1.28124        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.14766        
-Standard deviation between time steps:                                     0.00667253     
-Time elapsed in the skipped time steps:                                    0.142905       
+Average time per time step:                                                0.14236        
+Standard deviation between time steps:                                     0.00709995     
+Time elapsed in the skipped time steps:                                    0.120199       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0643949       | 43.6        | 1              
-Convection operator                      | 0.01452043      |  9.8        | 1              
-Diffusion operator                       | 0.004831964     |  3.3        | 1              
-Gradient operator                        | 0.02015019      | 13.6        | 2              
-Divergence operator                      | 0.01717549      | 11.6        | 2              
-Update ::mettre_a_jour                   | 0.0138784       |  9.4        | 1              
-Computation of the time step dt          | 0.006865698     |  4.6        | 2              
-Post-treatment operations                | 0.003516858     |  2.4        | 1              
-Other operations                         | 0.002325704     |  1.6        | 
+Linear solver resolutions Ax=B           | 0.0644412       | 45.3        | 1              
+Convection operator                      | 0.01452938      | 10.2        | 1              
+Diffusion operator                       | 0.004854963     |  3.4        | 1              
+Gradient operator                        | 0.01442492      | 10.1        | 2              
+Divergence operator                      | 0.0172205       | 12.1        | 2              
+Update ::mettre_a_jour                   | 0.01393577      |  9.8        | 1              
+Computation of the time step dt          | 0.006881474     |  4.8        | 2              
+Post-treatment operations                | 0.003676644     |  2.6        | 1              
+Other operations                         | 0.002395204     |  1.7        | 
 
 Average number of iteration of the linear solver per call:                 20.7           
 
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 20.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0641017       | 43.4        | 1               | 
-Kernels:                                 | 0.0777633       | 52.7        | 147             | 
-Copy host to device:                     | 0.000519965     |  0.4        | 10              | 2.6 GB/s
-Copy device to host:                     | 0.000765415     |  0.5        | 1               | 4.4 GB/s
-Alloc/Free on device:                    | 4.60109e-05     |  0.0        | 0               | 
-GPU: 96% Copy H<->D: 0.87% Alloc/free: 0.031% Comm: 0% CPU & I/O: 3%
+Libraries:                               | 0.0641377       | 45.1        | 1               | 
+Kernels:                                 | 0.0719512       | 50.5        | 146             | 
+Copy host to device:                     | 0.00053085      |  0.4        | 10              | 2.5 GB/s
+Copy device to host:                     | 0.000839204     |  0.6        | 1               | 4.0 GB/s
+Alloc/Free on device:                    | 5.52052e-05     |  0.0        | 0               | 
+GPU: 96% Copy H<->D: 0.96% Alloc/free: 0.039% Comm: 0% CPU & I/O: 3.4%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.149494       
+Time of the post-resolution:                                               0.150737       
 
-Total time for the whole computation                                       52.6105        
+Total time for the whole computation                                       53.8804        
 
-[Slurm] Power consumption (68 s):  0.205 kW  0.004 kWh  0.000 € (0.10€/kWh)
+[Slurm] Power consumption (67 s):  0.211 kW  0.004 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.is157091_cc86 b/tests/GPU/GPU4/GPU4_BENCH.TU.is157091_cc86
index 81926a0981..ce591d36ec 100644
--- a/tests/GPU/GPU4/GPU4_BENCH.TU.is157091_cc86
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.is157091_cc86
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     10-03-2026 -- 08:40:38
+Date:     14-05-2026 -- 16:09:55
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
 GPU model: NVIDIA RTX A6000
 CUDA runtime version: 12.90
-CUDA drivers version: 12.70
+CUDA drivers version: 13.20
 Nb procs used for the computation: 1
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 2592000
@@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                34.5173        
+Total time of the start-up:                                                35.7139        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.46653        
+Average time of the resolution of the linear problem per call:             1.70453        
 Average number of iteration of the linear solver per call:                 33             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.26768        
+Total time of the time loop:                                               1.00381        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.140853       
-Standard deviation between time steps:                                     0.00449016     
-Time elapsed in the skipped time steps:                                    0.0853086      
+Average time per time step:                                                0.111534       
+Standard deviation between time steps:                                     0.00154677     
+Time elapsed in the skipped time steps:                                    0.0637836      
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0663534       | 47.1        | 1              
-Convection operator                      | 0.01407047      | 10.0        | 1              
-Diffusion operator                       | 0.004086097     |  2.9        | 1              
-Gradient operator                        | 0.01717689      | 12.2        | 2              
-Divergence operator                      | 0.01332984      |  9.5        | 2              
-Update ::mettre_a_jour                   | 0.01163229      |  8.3        | 1              
-Computation of the time step dt          | 0.007360857     |  5.2        | 2              
-Post-treatment operations                | 0.002709982     |  1.9        | 1              
-Other operations                         | 0.004133031     |  2.9        | 
+Linear solver resolutions Ax=B           | 0.0576824       | 51.7        | 1              
+Convection operator                      | 0.01250822      | 11.2        | 1              
+Diffusion operator                       | 0.003500001     |  3.1        | 1              
+Gradient operator                        | 0.007888202     |  7.1        | 2              
+Divergence operator                      | 0.008768458     |  7.9        | 2              
+Update ::mettre_a_jour                   | 0.008300216     |  7.4        | 1              
+Computation of the time step dt          | 0.005651737     |  5.1        | 2              
+Post-treatment operations                | 0.00148399      |  1.3        | 1              
+Other operations                         | 0.005751195     |  5.2        | 
 
 Average number of iteration of the linear solver per call:                 20.7           
 
@@ -60,16 +60,16 @@ Average number of iteration of the linear solver per call:                 20.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0660378       | 46.9        | 1               | 
-Kernels:                                 | 0.0706805       | 50.2        | 147             | 
-Copy host to device:                     | 0.000225445     |  0.2        | 10              | 5.9 GB/s
-Copy device to host:                     | 0.000299242     |  0.2        | 1               | 11.2 GB/s
-Alloc/Free on device:                    | 2.02081e-05     |  0.0        | 0               | 
-GPU: 97% Copy H<->D: 0.37% Alloc/free: 0.014% Comm: 0% CPU & I/O: 2.5%
+Libraries:                               | 0.0573718       | 51.4        | 1               | 
+Kernels:                                 | 0.0510853       | 45.8        | 153             | 
+Copy host to device:                     | 0.000203896     |  0.2        | 9               | 5.6 GB/s
+Copy device to host:                     | 0.000333923     |  0.3        | 2               | 10.6 GB/s
+Alloc/Free on device:                    | 2.21462e-05     |  0.0        | 0               | 
+GPU: 97% Copy H<->D: 0.48% Alloc/free: 0.02% Comm: 0% CPU & I/O: 2.3%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0596979      
+Time of the post-resolution:                                               0.0572605      
 
-Total time for the whole computation                                       35.93          
+Total time for the whole computation                                       36.8387        
 
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.is159479_cc120 b/tests/GPU/GPU4/GPU4_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..4c415877a8
--- /dev/null
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.is159479_cc120
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GPU4_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:35:08
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2592000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                24.6151        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.970803       
+Average number of iteration of the linear solver per call:                 33             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.394239       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0438043      
+Standard deviation between time steps:                                     0.00198087     
+Time elapsed in the skipped time steps:                                    0.0374093      
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0235285       | 53.7        | 1              
+Convection operator                      | 0.004622394     | 10.6        | 1              
+Diffusion operator                       | 0.001596872     |  3.6        | 1              
+Gradient operator                        | 0.004136096     |  9.4        | 2              
+Divergence operator                      | 0.001846463     |  4.2        | 2              
+Update ::mettre_a_jour                   | 0.002348554     |  5.4        | 1              
+Computation of the time step dt          | 0.0009105937    |  2.1        | 2              
+Post-treatment operations                | 0.001085021     |  2.5        | 1              
+Other operations                         | 0.003729836     |  8.5        | 
+
+Average number of iteration of the linear solver per call:                 20.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0234364       | 53.5        | 1               | 
+Kernels:                                 | 0.0180744       | 41.3        | 146             | 
+Copy host to device:                     | 0.000161213     |  0.4        | 10              | 8.3 GB/s
+Copy device to host:                     | 0.000382395     |  0.9        | 1               | 8.8 GB/s
+Alloc/Free on device:                    | 2.0011e-05      |  0.0        | 0               | 
+GPU: 95% Copy H<->D: 1.2% Alloc/free: 0.046% Comm: 0% CPU & I/O: 3.9%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0507625      
+
+Total time for the whole computation                                       25.0975        
+
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.is247793_gfx1100 b/tests/GPU/GPU4/GPU4_BENCH.TU.is247793_gfx1100
index 3478687aa7..d014f13ceb 100644
--- a/tests/GPU/GPU4/GPU4_BENCH.TU.is247793_gfx1100
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.is247793_gfx1100
@@ -1,50 +1,75 @@
-Statistiques d'initialisation du calcul
-
-Temps total                       51.3163
-
-Statistiques de resolution du probleme
-
-Temps total                       16.0305
-
-
-Timesteps                         10
-Secondes / pas de temps           1.60305
-Dont solveurs Ax=B                1.409289 87% (1 appel/pas de temps)
-Dont mettre_a_jour                0.018862  1% (1 appel/pas de temps)
-Dont operateurs convection        0.029484  1% (1 appel/pas de temps)
-Dont operateurs diffusion         0.010304  0% (1 appel/pas de temps)
-Dont operateurs gradient          0.022206  1% (2 appels/pas de temps)
-Dont operateurs divergence        0.008665  0% (2 appels/pas de temps)
-Dont operations postraitement     0.074592  4% (1 appel/pas de temps)
-Dont calcul dt                    0.004162  0% (2 appels/pas de temps)
-Dont calcul divers                0.025483  1% (0 appels/pas de temps)
-Nb solveur / pas de temps         1
-Secondes / solveur                1.40929
-Iterations / solveur              275.6
-GPU statistics per time step (experimental):
-Libraries : 1.409001 s 87.9%  1.0 calls
-Kernels   : 0.094763 s  5.9% 116.7 calls
-Copy H2D  : 0.006382 s  0.4% 15.8 calls 11.9 GB/s
-Copy D2H  : 0.012701 s  0.8% 19.1 calls 17.4 GB/s
-Alloc/Free: 0.000266 s  0.0%  1.4 calls
-GPU: 93.8% Copy H<->D: 1.1% Alloc/Free: 0% Comm: 0% CPU & Others: 4.9%
-I/O:
-
-Timesteps = number of time steps
-Nb solveur = number of linear system resolutions
-Nb assemblage implicite = number of matrix assemblies for the implicit scheme
-Iterations = average number of iterations of the solver
-Communications = fraction of the time spent
-                 in communications between processors (excluding io files)
-Network latency = time of one mpsum measured by an internal bench over 0.1s
-Network bandwidth = maximum on all processors
-                    of the average bandwidth of send_recv operations
-Waiting time = estimation of the waiting time of the different processors
-
-Max_waiting_time big    => probably due to a bad partitioning
-Communications > 30%    => too many processors or network too slow
-
-Statistiques de post resolution
-
-Temps total                       0.068697
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the GPU4_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 19:03:06
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2592000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                49.3535        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             3.13722        
+Average number of iteration of the linear solver per call:                 33             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.14455        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.127172       
+Standard deviation between time steps:                                     0.00390832     
+Time elapsed in the skipped time steps:                                    0.0844205      
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0652737       | 51.3        | 1              
+Convection operator                      | 0.01091375      |  8.6        | 1              
+Diffusion operator                       | 0.004316445     |  3.4        | 1              
+Gradient operator                        | 0.01320478      | 10.4        | 2              
+Divergence operator                      | 0.006607952     |  5.2        | 2              
+Update ::mettre_a_jour                   | 0.007333066     |  5.8        | 1              
+Computation of the time step dt          | 0.003446431     |  2.7        | 2              
+Post-treatment operations                | 0.002252622     |  1.8        | 1              
+Other operations                         | 0.01382318      | 10.9        | 
+
+Average number of iteration of the linear solver per call:                 20.7           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0648951       | 51.0        | 1               | 
+Kernels:                                 | 0.055383        | 43.5        | 153             | 
+Copy host to device:                     | 0.000475677     |  0.4        | 9               | 2.4 GB/s
+Copy device to host:                     | 0.000299563     |  0.2        | 2               | 11.9 GB/s
+Alloc/Free on device:                    | 0.000134375     |  0.1        | 0               | 
+GPU: 95% Copy H<->D: 0.61% Alloc/free: 0.11% Comm: 0% CPU & I/O: 4.7%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.101868       
+
+Total time for the whole computation                                       50.6843        
 
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.lumi_gfx90a b/tests/GPU/GPU4/GPU4_BENCH.TU.lumi_gfx90a
index fc180748d0..0c54e27f11 100644
--- a/tests/GPU/GPU4/GPU4_BENCH.TU.lumi_gfx90a
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.lumi_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     23-02-2026 -- 23:59:18
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     18-05-2026 -- 08:58:15
+OS:       nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                80.7221        
+Total time of the start-up:                                                95.9527        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             3.24025        
-Average number of iteration of the linear solver per call:                 31.5           
+Average time of the resolution of the linear problem per call:             4.54876        
+Average number of iteration of the linear solver per call:                 33             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.48028        
+Total time of the time loop:                                               0.983467       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.164476       
-Standard deviation between time steps:                                     0.00638041     
-Time elapsed in the skipped time steps:                                    0.139515       
+Average time per time step:                                                0.109274       
+Standard deviation between time steps:                                     0.00630091     
+Time elapsed in the skipped time steps:                                    0.0982491      
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.044052        | 24.5        | 1              
-Convection operator                      | 0.02158292      | 12.0        | 1              
-Diffusion operator                       | 0.008052386     |  4.5        | 1              
-Gradient operator                        | 0.05045038      | 28.0        | 2              
-Divergence operator                      | 0.01614263      |  9.0        | 2              
-Update ::mettre_a_jour                   | 0.01302959      |  7.2        | 1              
-Computation of the time step dt          | 0.006851856     |  3.8        | 2              
-Post-treatment operations                | 0.003123114     |  1.7        | 1              
-Other operations                         | 0.001190999     |  0.7        | 
+Linear solver resolutions Ax=B           | 0.0349821       | 32.0        | 1              
+Convection operator                      | 0.01327583      | 12.1        | 1              
+Diffusion operator                       | 0.007554157     |  6.9        | 1              
+Gradient operator                        | 0.02225961      | 20.4        | 2              
+Divergence operator                      | 0.01005504      |  9.2        | 2              
+Update ::mettre_a_jour                   | 0.008657176     |  7.9        | 1              
+Computation of the time step dt          | 0.004889879     |  4.5        | 2              
+Post-treatment operations                | 0.002895587     |  2.6        | 1              
+Other operations                         | 0.004704638     |  4.3        | 
 
-Average number of iteration of the linear solver per call:                 17.7           
+Average number of iteration of the linear solver per call:                 20.7           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 17.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0432993       | 26.3        | 1               | 
-Kernels:                                 | 0.115903        | 70.5        | 147             | 
-Copy host to device:                     | 0.000290883     |  0.2        | 10              | 4.6 GB/s
-Copy device to host:                     | 0.000189674     |  0.1        | 1               | 17.7 GB/s
-Alloc/Free on device:                    | 1.32044e-05     |  0.0        | 0               | 
-GPU: 97% Copy H<->D: 0.29% Alloc/free: 0.008% Comm: 0% CPU & I/O: 2.9%
+Libraries:                               | 0.0347634       | 31.8        | 1               | 
+Kernels:                                 | 0.0698308       | 63.9        | 153             | 
+Copy host to device:                     | 0.000244381     |  0.2        | 9               | 4.7 GB/s
+Copy device to host:                     | 0.000228841     |  0.2        | 2               | 15.5 GB/s
+Alloc/Free on device:                    | 1.47298e-05     |  0.0        | 0               | 
+GPU: 96% Copy H<->D: 0.43% Alloc/free: 0.013% Comm: 0% CPU & I/O: 3.8%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.139935       
+Time of the post-resolution:                                               0.149004       
 
-Total time for the whole computation                                       82.4819        
+Total time for the whole computation                                       97.1834        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (121 s):  0.485 kW  0.016 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/GPU4/GPU4_BENCH.TU.topaze_cc80 b/tests/GPU/GPU4/GPU4_BENCH.TU.topaze_cc80
index e257d2f32c..97412b3ffd 100644
--- a/tests/GPU/GPU4/GPU4_BENCH.TU.topaze_cc80
+++ b/tests/GPU/GPU4/GPU4_BENCH.TU.topaze_cc80
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 17:43:48
-OS:       topaze7059__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     15-05-2026 -- 13:33:13
+OS:       topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                45.444         
+Total time of the start-up:                                                45.6469        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             2.14431        
+Average time of the resolution of the linear problem per call:             2.08417        
 Average number of iteration of the linear solver per call:                 33             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.652446       
+Total time of the time loop:                                               0.556963       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.072494       
-Standard deviation between time steps:                                     0.00699117     
-Time elapsed in the skipped time steps:                                    0.115932       
+Average time per time step:                                                0.0618848      
+Standard deviation between time steps:                                     0.00639562     
+Time elapsed in the skipped time steps:                                    0.0650986      
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0327829       | 38.4        | 1              
-Convection operator                      | 0.008547117     | 10.0        | 1              
-Diffusion operator                       | 0.002531127     |  3.0        | 1              
-Gradient operator                        | 0.009250802     | 10.8        | 2              
-Divergence operator                      | 0.004430615     |  5.2        | 2              
-Update ::mettre_a_jour                   | 0.004444057     |  5.2        | 1              
-Computation of the time step dt          | 0.002272804     |  2.7        | 2              
-Post-treatment operations                | 0.003122811     |  3.7        | 1              
-Other operations                         | 0.005111731     |  6.0        | 
+Linear solver resolutions Ax=B           | 0.0319398       | 51.6        | 1              
+Convection operator                      | 0.005375055     |  8.7        | 1              
+Diffusion operator                       | 0.002208149     |  3.6        | 1              
+Gradient operator                        | 0.0070071       | 11.3        | 2              
+Divergence operator                      | 0.001991878     |  3.2        | 2              
+Update ::mettre_a_jour                   | 0.0028667       |  4.6        | 1              
+Computation of the time step dt          | 0.001355453     |  2.2        | 2              
+Post-treatment operations                | 0.002902446     |  4.7        | 1              
+Other operations                         | 0.00623828      | 10.1        | 
 
 Average number of iteration of the linear solver per call:                 20.7           
 
@@ -60,16 +60,17 @@ Average number of iteration of the linear solver per call:                 20.7
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0326061       | 45.0        | 1               | 
-Kernels:                                 | 0.0341615       | 47.1        | 147             | 
-Copy host to device:                     | 0.000264131     |  0.4        | 10              | 5.1 GB/s
-Copy device to host:                     | 0.000340788     |  0.5        | 1               | 9.9 GB/s
-Alloc/Free on device:                    | 4.67921e-05     |  0.1        | 0               | 
-GPU: 92% Copy H<->D: 0.83% Alloc/free: 0.065% Comm: 0% CPU & I/O: 7%
+Libraries:                               | 0.0317634       | 51.3        | 1               | 
+Kernels:                                 | 0.0238069       | 38.5        | 153             | 
+Copy host to device:                     | 0.000227296     |  0.4        | 9               | 5.0 GB/s
+Copy device to host:                     | 0.000285206     |  0.5        | 2               | 12.4 GB/s
+Alloc/Free on device:                    | 4.25278e-05     |  0.1        | 0               | 
+GPU: 90% Copy H<->D: 0.83% Alloc/free: 0.069% Comm: 0% CPU & I/O: 9.3%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.181824       
+Time of the post-resolution:                                               0.171073       
 
-Total time for the whole computation                                       46.3942        
+Total time for the whole computation                                       46.4401        
 
+[Slurm] Power consumption (80 s):  0.409 kW  0.009 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab.data b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab.data
index d49102d58b..7aa2fe01ab 100644
--- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab.data
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab.data
@@ -41,7 +41,8 @@ Scatter DOM.Zones dom_fluide
 END SCATTER #
 
 
-VEFPreP1B dis
+VEFPreP1B dis 
+Lire dis { reorder { algo Hilbert } }
 
 Scheme_euler_implicit sch
 Read sch
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx90a b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx90a
index d91d299b7a..916ff81393 100644
--- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx90a
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     12-03-2026 -- 18:28:24
-OS:       g1016__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     24-05-2026 -- 16:00:01
+OS:       g1321__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                54.1322        
+Total time of the start-up:                                                44.9673        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             2.49013        
+Average time of the resolution of the linear problem per call:             3.22864        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               12.1558        
+Total time of the time loop:                                               10.5043        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.35065        
-Standard deviation between time steps:                                     0.0459982      
-Time elapsed in the skipped time steps:                                    14.7339        
+Average time per time step:                                                1.16715        
+Standard deviation between time steps:                                     0.0400912      
+Time elapsed in the skipped time steps:                                    26.41          
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.538321        | 39.9        | 3              
-Matrix assembly for implicit scheme      | 0.1717073       | 12.7        | 1              
-Convection operator                      | 0.2151823       | 15.9        | 4              
-Diffusion operator                       | 0.01442614      |  1.1        | 2              
-Divergence operator                      | 0.03195598      |  2.4        | 4              
-Source terms                             | 0.0005473772    |  0.0        | 2              
-Update ::mettre_a_jour                   | 0.01190256      |  0.9        | 4              
-Computation of the time step dt          | 0.001549987     |  0.1        | 4              
-Post-treatment operations                | 0.02270432      |  1.7        | 1              
-Other operations                         | 0.3423522       | 25.3        | 
+Linear solver resolutions Ax=B           | 0.498521        | 42.7        | 3              
+Matrix assembly for implicit scheme      | 0.1270503       | 10.9        | 1              
+Convection operator                      | 0.1485969       | 12.7        | 4              
+Diffusion operator                       | 0.01211031      |  1.0        | 2              
+Divergence operator                      | 0.021462        |  1.8        | 4              
+Source terms                             | 0.0005480331    |  0.0        | 2              
+Update ::mettre_a_jour                   | 0.009032102     |  0.8        | 4              
+Computation of the time step dt          | 0.001589899     |  0.1        | 4              
+Post-treatment operations                | 0.02160866      |  1.9        | 1              
+Other operations                         | 0.3266274       | 28.0        | 
 
 Average number of iteration of the linear solver per call:                 23.4           
 
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 23.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.515509        | 38.2        | 3               | 
-Kernels:                                 | 0.823336        | 61.0        | 435             | 
-Copy host to device:                     | 0.00069958      |  0.1        | 21              | 6.4 GB/s
-Copy device to host:                     | 0.000783069     |  0.1        | 7               | 14.2 GB/s
-Alloc/Free on device:                    | 0.00013029      |  0.0        | 4               | 
-GPU: 99% Copy H<->D: 0.11% Alloc/free: 0.0096% Comm: 0% CPU & I/O: 0.75%
+Libraries:                               | 0.476198        | 40.8        | 3               | 
+Kernels:                                 | 0.678666        | 58.1        | 434             | 
+Copy host to device:                     | 0.000692706     |  0.1        | 21              | 6.5 GB/s
+Copy device to host:                     | 0.00079482      |  0.1        | 7               | 14.0 GB/s
+Alloc/Free on device:                    | 0.000150397     |  0.0        | 4               | 
+GPU: 99% Copy H<->D: 0.13% Alloc/free: 0.013% Comm: 0% CPU & I/O: 0.91%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.198921       
+Time of the post-resolution:                                               0.202604       
 
-Total time for the whole computation                                       81.2209        
+Total time for the whole computation                                       82.0843        
 
-[Slurm] Power consumption (89 s):  0.513 kW  0.013 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (88 s):  0.476 kW  0.012 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx942 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx942
index 7a2a08434d..0830e1ccbe 100644
--- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx942
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.adastra_gfx942
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     17-03-2026 -- 17:55:25
-OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 14:44:39
+OS:       a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD Instinct MI300A Accelerator
 Total number of threads:192
 GPU model: AMD Instinct MI300A
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                85.6592        
+Total time of the start-up:                                                63.6504        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             3.56771        
+Average time of the resolution of the linear problem per call:             3.86101        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               5.31926        
+Total time of the time loop:                                               5.12227        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.591029       
-Standard deviation between time steps:                                     0.0943462      
-Time elapsed in the skipped time steps:                                    26.3575        
+Average time per time step:                                                0.569141       
+Standard deviation between time steps:                                     0.0908845      
+Time elapsed in the skipped time steps:                                    28.5199        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.408167        | 69.1        | 3              
-Matrix assembly for implicit scheme      | 0.03734021      |  6.3        | 1              
-Convection operator                      | 0.04406685      |  7.5        | 4              
-Diffusion operator                       | 0.004099765     |  0.7        | 2              
-Divergence operator                      | 0.008980409     |  1.5        | 4              
-Source terms                             | 0.0002865144    |  0.0        | 2              
-Update ::mettre_a_jour                   | 0.006264622     |  1.1        | 4              
-Computation of the time step dt          | 0.001301963     |  0.2        | 4              
-Post-treatment operations                | 0.01142354      |  1.9        | 1              
-Other operations                         | 0.06909795      | 11.7        | 
+Linear solver resolutions Ax=B           | 0.399856        | 70.3        | 3              
+Matrix assembly for implicit scheme      | 0.03381677      |  5.9        | 1              
+Convection operator                      | 0.04030673      |  7.1        | 4              
+Diffusion operator                       | 0.004144521     |  0.7        | 2              
+Divergence operator                      | 0.006502531     |  1.1        | 4              
+Source terms                             | 0.0003088568    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.005641864     |  1.0        | 4              
+Computation of the time step dt          | 0.001270153     |  0.2        | 4              
+Post-treatment operations                | 0.0110431       |  1.9        | 1              
+Other operations                         | 0.06625069      | 11.6        | 
 
 Average number of iteration of the linear solver per call:                 23.4           
 
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 23.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.398663        | 67.5        | 3               | 
-Kernels:                                 | 0.179445        | 30.4        | 434             | 
-Copy host to device:                     | 0.000668456     |  0.1        | 21              | 6.7 GB/s
-Copy device to host:                     | 0.00053813      |  0.1        | 7               | 20.7 GB/s
-Alloc/Free on device:                    | 0.000937058     |  0.2        | 4               | 
-GPU: 98% Copy H<->D: 0.2% Alloc/free: 0.16% Comm: 0% CPU & I/O: 1.8%
+Libraries:                               | 0.390398        | 68.6        | 3               | 
+Kernels:                                 | 0.165589        | 29.1        | 434             | 
+Copy host to device:                     | 0.000691497     |  0.1        | 21              | 6.5 GB/s
+Copy device to host:                     | 0.000544154     |  0.1        | 7               | 20.5 GB/s
+Alloc/Free on device:                    | 0.000897341     |  0.2        | 4               | 
+GPU: 98% Copy H<->D: 0.22% Alloc/free: 0.16% Comm: 0% CPU & I/O: 1.9%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.190028       
+Time of the post-resolution:                                               0.182175       
 
-Total time for the whole computation                                       117.526        
+Total time for the whole computation                                       97.4747        
 
-[Slurm] Power consumption (127 s):  0.693 kW  0.024 kWh  0.002 € (0.10€/kWh)
+[Slurm] Power consumption (107 s):  0.680 kW  0.020 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.dalianvl_cc100 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..1aef35a8ee
--- /dev/null
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Implicit_ef_stab_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:16:39
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                40.0777        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             2.69552        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.52199        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.16911        
+Standard deviation between time steps:                                     0.0156024      
+Time elapsed in the skipped time steps:                                    14.9398        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.108892        | 64.4        | 3              
+Matrix assembly for implicit scheme      | 0.01232241      |  7.3        | 1              
+Convection operator                      | 0.0112719       |  6.7        | 4              
+Diffusion operator                       | 0.001829879     |  1.1        | 2              
+Divergence operator                      | 0.00168482      |  1.0        | 4              
+Source terms                             | 0.0001482987    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.002643297     |  1.6        | 4              
+Computation of the time step dt          | 0.000565564     |  0.3        | 4              
+Post-treatment operations                | 0.005844443     |  3.5        | 1              
+Other operations                         | 0.02390721      | 14.1        | 
+
+Average number of iteration of the linear solver per call:                 23.4           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.101757        | 60.2        | 3               | 
+Kernels:                                 | 0.0561644       | 33.2        | 434             | 
+Copy host to device:                     | 0.000435701     |  0.3        | 21              | 10.3 GB/s
+Copy device to host:                     | 0.000350478     |  0.2        | 7               | 31.8 GB/s
+Alloc/Free on device:                    | 0.0019882       |  1.2        | 4               | 
+GPU: 93% Copy H<->D: 0.46% Alloc/free: 1.2% Comm: 0% CPU & I/O: 5%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.151419       
+
+Total time for the whole computation                                       56.691         
+
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.eureka_cc89 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..cfcb620dd7
--- /dev/null
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.eureka_cc89
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Implicit_ef_stab_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:33:22
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                51.361         
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             3.54282        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               5.0599         
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.562211       
+Standard deviation between time steps:                                     0.0438569      
+Time elapsed in the skipped time steps:                                    36.3611        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.338952        | 60.3        | 3              
+Matrix assembly for implicit scheme      | 0.03494757      |  6.2        | 1              
+Convection operator                      | 0.03678962      |  6.5        | 4              
+Diffusion operator                       | 0.003927381     |  0.7        | 2              
+Divergence operator                      | 0.004817764     |  0.9        | 4              
+Source terms                             | 0.0006496889    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.004099632     |  0.7        | 4              
+Computation of the time step dt          | 0.001328052     |  0.2        | 4              
+Post-treatment operations                | 0.007013156     |  1.2        | 1              
+Other operations                         | 0.1296865       | 23.1        | 
+
+Average number of iteration of the linear solver per call:                 23.4           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.306028        | 54.4        | 3               | 
+Kernels:                                 | 0.247187        | 44.0        | 434             | 
+Copy host to device:                     | 0.00103896      |  0.2        | 21              | 4.3 GB/s
+Copy device to host:                     | 0.00102148      |  0.2        | 7               | 10.9 GB/s
+Alloc/Free on device:                    | 0.000524655     |  0.1        | 4               | 
+GPU: 98% Copy H<->D: 0.37% Alloc/free: 0.093% Comm: 0% CPU & I/O: 1.1%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0875391      
+
+Total time for the whole computation                                       92.8696        
+
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is157091_cc86 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is157091_cc86
index fe44c55875..5a041cf020 100644
--- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is157091_cc86
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is157091_cc86
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 17:56:57
+Date:     08-06-2026 -- 11:18:18
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
 GPU model: NVIDIA RTX A6000
 CUDA runtime version: 12.90
-CUDA drivers version: 12.70
+CUDA drivers version: 13.20
 Nb procs used for the computation: 1
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 2560000
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                43.4891        
+Total time of the start-up:                                                27.814         
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             2.69107        
+Average time of the resolution of the linear problem per call:             2.49741        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               9.01733        
+Total time of the time loop:                                               7.71198        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.00193        
-Standard deviation between time steps:                                     0.056026       
-Time elapsed in the skipped time steps:                                    21.6859        
+Average time per time step:                                                0.856887       
+Standard deviation between time steps:                                     0.0537875      
+Time elapsed in the skipped time steps:                                    26.9383        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.461884        | 46.1        | 3              
-Matrix assembly for implicit scheme      | 0.09949726      |  9.9        | 1              
-Convection operator                      | 0.09331728      |  9.3        | 4              
-Diffusion operator                       | 0.007343724     |  0.7        | 2              
-Divergence operator                      | 0.02650007      |  2.6        | 4              
-Source terms                             | 0.001259325     |  0.1        | 2              
-Update ::mettre_a_jour                   | 0.01118447      |  1.1        | 4              
-Computation of the time step dt          | 0.002266761     |  0.2        | 4              
-Post-treatment operations                | 0.01669265      |  1.7        | 1              
-Other operations                         | 0.2819804       | 28.1        | 
+Linear solver resolutions Ax=B           | 0.444648        | 51.9        | 3              
+Matrix assembly for implicit scheme      | 0.07059693      |  8.2        | 1              
+Convection operator                      | 0.07447829      |  8.7        | 4              
+Diffusion operator                       | 0.005784988     |  0.7        | 2              
+Divergence operator                      | 0.01199142      |  1.4        | 4              
+Source terms                             | 0.0007986473    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.007751844     |  0.9        | 4              
+Computation of the time step dt          | 0.002273929     |  0.3        | 4              
+Post-treatment operations                | 0.01234477      |  1.4        | 1              
+Other operations                         | 0.2262184       | 26.4        | 
 
 Average number of iteration of the linear solver per call:                 23.4           
 
@@ -61,16 +61,16 @@ Average number of iteration of the linear solver per call:                 23.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.42272         | 42.2        | 3               | 
-Kernels:                                 | 0.568741        | 56.8        | 435             | 
-Copy host to device:                     | 0.00135453      |  0.1        | 21              | 3.3 GB/s
-Copy device to host:                     | 0.00110431      |  0.1        | 7               | 10.1 GB/s
-Alloc/Free on device:                    | 0.000495608     |  0.0        | 4               | 
-GPU: 99% Copy H<->D: 0.25% Alloc/free: 0.049% Comm: 0% CPU & I/O: 0.75%
+Libraries:                               | 0.403468        | 47.1        | 3               | 
+Kernels:                                 | 0.442613        | 51.7        | 434             | 
+Copy host to device:                     | 0.00135345      |  0.2        | 21              | 3.3 GB/s
+Copy device to host:                     | 0.00111343      |  0.1        | 7               | 10.0 GB/s
+Alloc/Free on device:                    | 0.000585798     |  0.1        | 4               | 
+GPU: 99% Copy H<->D: 0.29% Alloc/free: 0.068% Comm: 0% CPU & I/O: 0.9%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0718186      
+Time of the post-resolution:                                               0.0743327      
 
-Total time for the whole computation                                       74.2642        
+Total time for the whole computation                                       62.5386        
 
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is159479_cc120 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..07c59a74a3
--- /dev/null
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is159479_cc120
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Implicit_ef_stab_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:35:42
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                33.7225        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             1.97342        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.81034        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.31226        
+Standard deviation between time steps:                                     0.0220986      
+Time elapsed in the skipped time steps:                                    20.2445        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.189262        | 60.6        | 3              
+Matrix assembly for implicit scheme      | 0.02137981      |  6.8        | 1              
+Convection operator                      | 0.02542805      |  8.1        | 4              
+Diffusion operator                       | 0.002372511     |  0.8        | 2              
+Divergence operator                      | 0.002581936     |  0.8        | 4              
+Source terms                             | 0.000339013     |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.002378845     |  0.8        | 4              
+Computation of the time step dt          | 0.0007381257    |  0.2        | 4              
+Post-treatment operations                | 0.005218892     |  1.7        | 1              
+Other operations                         | 0.06256046      | 20.0        | 
+
+Average number of iteration of the linear solver per call:                 23.4           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.171479        | 54.9        | 3               | 
+Kernels:                                 | 0.134278        | 43.0        | 434             | 
+Copy host to device:                     | 0.000496272     |  0.2        | 21              | 9.0 GB/s
+Copy device to host:                     | 0.00135792      |  0.4        | 7               | 8.2 GB/s
+Alloc/Free on device:                    | 0.000442294     |  0.1        | 4               | 
+GPU: 98% Copy H<->D: 0.59% Alloc/free: 0.14% Comm: 0% CPU & I/O: 1.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0666653      
+
+Total time for the whole computation                                       56.844         
+
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is247793_gfx1100 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is247793_gfx1100
new file mode 100644
index 0000000000..0ca61660c2
--- /dev/null
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.is247793_gfx1100
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the Implicit_ef_stab_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 17:39:24
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                39.0243        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             4.05154        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               14.9036        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                1.65596        
+Standard deviation between time steps:                                     0.558843       
+Time elapsed in the skipped time steps:                                    27.9679        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 1.27397         | 76.9        | 3              
+Matrix assembly for implicit scheme      | 0.09543697      |  5.8        | 1              
+Convection operator                      | 0.1076007       |  6.5        | 4              
+Diffusion operator                       | 0.007823668     |  0.5        | 2              
+Divergence operator                      | 0.01254682      |  0.8        | 4              
+Source terms                             | 0.0009698686    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.007694129     |  0.5        | 4              
+Computation of the time step dt          | 0.002277375     |  0.1        | 4              
+Post-treatment operations                | 0.01607319      |  1.0        | 1              
+Other operations                         | 0.1315699       |  7.9        | 
+
+Average number of iteration of the linear solver per call:                 21.9           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 1.22954         | 74.2        | 3               | 
+Kernels:                                 | 0.41592         | 25.1        | 434             | 
+Copy host to device:                     | 0.000712979     |  0.0        | 21              | 6.3 GB/s
+Copy device to host:                     | 0.000712966     |  0.0        | 7               | 15.6 GB/s
+Alloc/Free on device:                    | 0.000785282     |  0.0        | 4               | 
+GPU: 99% Copy H<->D: 0.086% Alloc/free: 0.047% Comm: 0% CPU & I/O: 0.5%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0998975      
+
+Total time for the whole computation                                       81.9958        
+
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.jean-zay_cc90 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.jean-zay_cc90
index 14f0261cec..f113bd06f4 100644
--- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.jean-zay_cc90
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.jean-zay_cc90
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     21-02-2026 -- 07:35:00
-OS:       jzxh025__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+Date:     10-06-2026 -- 10:38:22
+OS:       jzxh361__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026
 CPU model : Intel(R) Xeon(R) Platinum 8468
 Total number of threads:192
 GPU model: NVIDIA H100 80GB HBM3
 CUDA runtime version: 12.60
-CUDA drivers version: 13.0
+CUDA drivers version: 13.20
 Nb procs used for the computation: 1
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 2560000
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                41.2371        
+Total time of the start-up:                                                32.7393        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             2.40859        
+Average time of the resolution of the linear problem per call:             2.67772        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               2.85861        
+Total time of the time loop:                                               2.6547         
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.317624       
-Standard deviation between time steps:                                     0.0171556      
-Time elapsed in the skipped time steps:                                    19.6314        
+Average time per time step:                                                0.294966       
+Standard deviation between time steps:                                     0.0274564      
+Time elapsed in the skipped time steps:                                    30.4392        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.163966        | 51.6        | 3              
-Matrix assembly for implicit scheme      | 0.02640968      |  8.3        | 1              
-Convection operator                      | 0.02437444      |  7.7        | 4              
-Diffusion operator                       | 0.003660248     |  1.2        | 2              
-Divergence operator                      | 0.006555501     |  2.1        | 4              
-Source terms                             | 0.0004897059    |  0.2        | 2              
-Update ::mettre_a_jour                   | 0.004763247     |  1.5        | 4              
-Computation of the time step dt          | 0.0008664688    |  0.3        | 4              
-Post-treatment operations                | 0.00878672      |  2.8        | 1              
-Other operations                         | 0.07775182      | 24.5        | 
+Linear solver resolutions Ax=B           | 0.159624        | 54.1        | 3              
+Matrix assembly for implicit scheme      | 0.01918769      |  6.5        | 1              
+Convection operator                      | 0.01836007      |  6.2        | 4              
+Diffusion operator                       | 0.002982189     |  1.0        | 2              
+Divergence operator                      | 0.00251469      |  0.9        | 4              
+Source terms                             | 0.0002447141    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.003794273     |  1.3        | 4              
+Computation of the time step dt          | 0.0007570568    |  0.3        | 4              
+Post-treatment operations                | 0.014242        |  4.8        | 1              
+Other operations                         | 0.07325931      | 24.8        | 
 
 Average number of iteration of the linear solver per call:                 23.4           
 
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 23.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.152809        | 48.1        | 3               | 
-Kernels:                                 | 0.15188         | 47.8        | 435             | 
-Copy host to device:                     | 0.00076906      |  0.2        | 21              | 5.8 GB/s
-Copy device to host:                     | 0.00166772      |  0.5        | 7               | 6.7 GB/s
-Alloc/Free on device:                    | 0.000731562     |  0.2        | 4               | 
-GPU: 96% Copy H<->D: 0.77% Alloc/free: 0.23% Comm: 0% CPU & I/O: 3.1%
+Libraries:                               | 0.148476        | 50.3        | 3               | 
+Kernels:                                 | 0.12706         | 43.1        | 434             | 
+Copy host to device:                     | 0.000763417     |  0.3        | 21              | 5.9 GB/s
+Copy device to host:                     | 0.00111141      |  0.4        | 7               | 10.0 GB/s
+Alloc/Free on device:                    | 0.000743817     |  0.3        | 4               | 
+GPU: 93% Copy H<->D: 0.64% Alloc/free: 0.25% Comm: 0% CPU & I/O: 5.7%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.185702       
+Time of the post-resolution:                                               0.122407       
 
-Total time for the whole computation                                       63.9128        
+Total time for the whole computation                                       65.9557        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (76 s):  0.451 kW  0.010 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.lumi_gfx90a b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.lumi_gfx90a
index 4e9fe93c02..cdeb8a3e64 100644
--- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.lumi_gfx90a
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.lumi_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     24-02-2026 -- 00:02:23
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     15-05-2026 -- 20:25:26
+OS:       nid007955__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                90.1836        
+Total time of the start-up:                                                125.203        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             6.28351        
+Average time of the resolution of the linear problem per call:             7.04209        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               12.2985        
+Total time of the time loop:                                               10.6218        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.3665         
-Standard deviation between time steps:                                     0.0415515      
-Time elapsed in the skipped time steps:                                    30.9744        
+Average time per time step:                                                1.1802         
+Standard deviation between time steps:                                     0.0454529      
+Time elapsed in the skipped time steps:                                    40.9321        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.536635        | 11.2        | 3              
-Matrix assembly for implicit scheme      | 0.171109        |  3.6        | 1              
-Convection operator                      | 0.2075454       |  4.3        | 4              
-Diffusion operator                       | 0.01382455      |  0.3        | 2              
-Divergence operator                      | 0.03201726      |  0.7        | 4              
-Source terms                             | 0.0006596       |  0.0        | 2              
-Update ::mettre_a_jour                   | 0.01181509      |  0.2        | 4              
-Computation of the time step dt          | 0.00181459      |  0.0        | 4              
-Post-treatment operations                | 0.02227527      |  0.5        | 1              
-Other operations                         | 0.3688072       |  7.7        | 
+Linear solver resolutions Ax=B           | 0.499157        | 42.3        | 3              
+Matrix assembly for implicit scheme      | 0.1353561       | 11.5        | 1              
+Convection operator                      | 0.1723566       | 14.6        | 4              
+Diffusion operator                       | 0.01120409      |  0.9        | 2              
+Divergence operator                      | 0.02008284      |  1.7        | 4              
+Source terms                             | 0.002546747     |  0.2        | 2              
+Update ::mettre_a_jour                   | 0.008625667     |  0.7        | 4              
+Computation of the time step dt          | 0.001518505     |  0.1        | 4              
+Post-treatment operations                | 0.01973878      |  1.7        | 1              
+Other operations                         | 0.3096102       | 26.2        | 
 
 Average number of iteration of the linear solver per call:                 23.4           
 
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 23.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.513683        | 37.6        | 3               | 
-Kernels:                                 | 0.841248        | 61.6        | 435             | 
-Copy host to device:                     | 0.000779138     |  0.1        | 21              | 5.8 GB/s
-Copy device to host:                     | 0.00079515      |  0.1        | 7               | 14.0 GB/s
-Alloc/Free on device:                    | 0.000129299     |  0.0        | 4               | 
-GPU: 99% Copy H<->D: 0.12% Alloc/free: 0.0095% Comm: 0% CPU & I/O: 0.72%
+Libraries:                               | 0.47665         | 40.4        | 3               | 
+Kernels:                                 | 0.692056        | 58.6        | 434             | 
+Copy host to device:                     | 0.000810867     |  0.1        | 21              | 5.5 GB/s
+Copy device to host:                     | 0.000802662     |  0.1        | 7               | 13.9 GB/s
+Alloc/Free on device:                    | 0.000131847     |  0.0        | 4               | 
+GPU: 99% Copy H<->D: 0.14% Alloc/free: 0.011% Comm: 0% CPU & I/O: 0.83%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.176294       
+Time of the post-resolution:                                               0.184331       
 
-Total time for the whole computation                                       133.633        
+Total time for the whole computation                                       176.941        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (210 s):  0.501 kW  0.029 kWh  0.003 € (0.10€/kWh)
diff --git a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.topaze_cc80 b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.topaze_cc80
index 06000653dc..a6de5aed35 100644
--- a/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.topaze_cc80
+++ b/tests/GPU/Implicit_ef_stab/Implicit_ef_stab_BENCH.TU.topaze_cc80
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 17:46:19
-OS:       topaze7059__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     15-05-2026 -- 13:34:57
+OS:       topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,36 +22,36 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                58.6111        
+Total time of the start-up:                                                58.2478        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             4.21863        
+Average time of the resolution of the linear problem per call:             4.08683        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               4.31741        
+Total time of the time loop:                                               3.80344        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.479712       
-Standard deviation between time steps:                                     0.0290592      
-Time elapsed in the skipped time steps:                                    31.3083        
+Average time per time step:                                                0.422605       
+Standard deviation between time steps:                                     0.0280516      
+Time elapsed in the skipped time steps:                                    39.0974        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.252873        |  6.4        | 3              
-Matrix assembly for implicit scheme      | 0.04422392      |  1.1        | 1              
-Convection operator                      | 0.04284369      |  1.1        | 4              
-Diffusion operator                       | 0.005367558     |  0.1        | 2              
-Divergence operator                      | 0.008792561     |  0.2        | 4              
-Source terms                             | 0.00110497      |  0.0        | 2              
-Update ::mettre_a_jour                   | 0.005035711     |  0.1        | 4              
-Computation of the time step dt          | 0.001184417     |  0.0        | 4              
-Post-treatment operations                | 0.0116908       |  0.3        | 1              
-Other operations                         | 0.1065955       |  2.7        | 
+Linear solver resolutions Ax=B           | 0.242674        | 57.4        | 3              
+Matrix assembly for implicit scheme      | 0.02970123      |  7.0        | 1              
+Convection operator                      | 0.02794444      |  6.6        | 4              
+Diffusion operator                       | 0.004167412     |  1.0        | 2              
+Divergence operator                      | 0.003920845     |  0.9        | 4              
+Source terms                             | 0.0003530411    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.003896615     |  0.9        | 4              
+Computation of the time step dt          | 0.0009981211    |  0.2        | 4              
+Post-treatment operations                | 0.009928146     |  2.3        | 1              
+Other operations                         | 0.09902102      | 23.4        | 
 
 Average number of iteration of the linear solver per call:                 23.4           
 
@@ -61,16 +61,17 @@ Average number of iteration of the linear solver per call:                 23.4
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.235152        | 49.0        | 3               | 
-Kernels:                                 | 0.230355        | 48.0        | 435             | 
-Copy host to device:                     | 0.00154799      |  0.3        | 21              | 2.9 GB/s
-Copy device to host:                     | 0.00143133      |  0.3        | 7               | 7.8 GB/s
-Alloc/Free on device:                    | 0.000915653     |  0.2        | 4               | 
-GPU: 97% Copy H<->D: 0.62% Alloc/free: 0.19% Comm: 0% CPU & I/O: 2.1%
+Libraries:                               | 0.224982        | 53.2        | 3               | 
+Kernels:                                 | 0.184112        | 43.6        | 434             | 
+Copy host to device:                     | 0.00172855      |  0.4        | 21              | 2.6 GB/s
+Copy device to host:                     | 0.000943387     |  0.2        | 7               | 11.8 GB/s
+Alloc/Free on device:                    | 0.000872976     |  0.2        | 4               | 
+GPU: 97% Copy H<->D: 0.63% Alloc/free: 0.21% Comm: 0% CPU & I/O: 2.4%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.224487       
+Time of the post-resolution:                                               0.211573       
 
-Total time for the whole computation                                       94.4613        
+Total time for the whole computation                                       101.36         
 
+[Slurm] Power consumption (135 s):  0.443 kW  0.017 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/JEL_bous.data b/tests/GPU/JEL_bous/JEL_bous.data
index 9a74e93fb3..4e05f4c9b9 100644
--- a/tests/GPU/JEL_bous/JEL_bous.data
+++ b/tests/GPU/JEL_bous/JEL_bous.data
@@ -39,7 +39,9 @@ END PARTITION #
 Scatter dom.Zones dom
 END SCATTER #
 
-vef dis
+vef dis 
+Lire dis { reorder { algo hilbert } }
+
 Runge_Kutta_Rationnel_ordre_2 sch
 Lire sch
 {
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx90a b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx90a
index 45216fa313..a4e38d7c08 100644
--- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx90a
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     18-03-2026 -- 19:54:45
-OS:       g1085__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 09:25:45
+OS:       g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                41.1329        
+Total time of the start-up:                                                43.3078        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.21027        
+Average time of the resolution of the linear problem per call:             1.50034        
 Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               5.62354        
+Total time of the time loop:                                               4.2439         
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.624837       
-Standard deviation between time steps:                                     0.0134143      
-Time elapsed in the skipped time steps:                                    1.0322         
+Average time per time step:                                                0.471545       
+Standard deviation between time steps:                                     0.0104923      
+Time elapsed in the skipped time steps:                                    0.864911       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.12578         | 20.1        | 2              
-Convection operator                      | 0.05487504      |  8.8        | 4              
-Diffusion operator                       | 0.1397324       | 22.4        | 26             
-Gradient operator                        | 0.09053912      | 14.5        | 4              
-Divergence operator                      | 0.02166166      |  3.5        | 3              
-Source terms                             | 0.09107066      | 14.6        | 2              
-Update ::mettre_a_jour                   | 0.01492529      |  2.4        | 1              
-Solver for implicit diffusion            | 0.03447241      |  5.5        | 4              
-Computation of the time step dt          | 0.03681047      |  5.9        | 8              
-Turbulence model::update                 | 0.005562819     |  0.9        | 1              
-Post-treatment operations                | 0.006412253     |  1.0        | 1              
-Other operations                         | 0.002994933     |  0.5        | 
+Linear solver resolutions Ax=B           | 0.112863        | 23.9        | 2              
+Convection operator                      | 0.03530255      |  7.5        | 4              
+Diffusion operator                       | 0.09859322      | 20.9        | 26             
+Gradient operator                        | 0.04428958      |  9.4        | 4              
+Divergence operator                      | 0.01316282      |  2.8        | 3              
+Source terms                             | 0.07620722      | 16.2        | 2              
+Update ::mettre_a_jour                   | 0.01118618      |  2.4        | 1              
+Solver for implicit diffusion            | 0.03458315      |  7.3        | 4              
+Computation of the time step dt          | 0.02691001      |  5.7        | 8              
+Turbulence model::update                 | 0.005000574     |  1.1        | 1              
+Post-treatment operations                | 0.006459059     |  1.4        | 1              
+Other operations                         | 0.006986863     |  1.5        | 
 
-Average number of iteration of the linear solver per call:                 33             
+Average number of iteration of the linear solver per call:                 36             
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 33
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.125386        | 20.1        | 2               | 
-Kernels:                                 | 0.491607        | 78.7        | 912             | 
-Copy host to device:                     | 0.000504216     |  0.1        | 18              | 7.1 GB/s
-Copy device to host:                     | 0.000747571     |  0.1        | 7               | 14.1 GB/s
-Alloc/Free on device:                    | 2.81546e-05     |  0.0        | 0               | 
-GPU: 99% Copy H<->D: 0.2% Alloc/free: 0.0045% Comm: 0% CPU & I/O: 1.1%
+Libraries:                               | 0.112453        | 23.8        | 2               | 
+Kernels:                                 | 0.35097         | 74.4        | 910             | 
+Copy host to device:                     | 0.000499709     |  0.1        | 18              | 7.1 GB/s
+Copy device to host:                     | 0.000743318     |  0.2        | 7               | 14.2 GB/s
+Alloc/Free on device:                    | 2.73587e-05     |  0.0        | 0               | 
+GPU: 98% Copy H<->D: 0.26% Alloc/free: 0.0058% Comm: 0% CPU & I/O: 1.5%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.227312       
+Time of the post-resolution:                                               0.184669       
 
-Total time for the whole computation                                       48.016         
+Total time for the whole computation                                       48.6013        
 
-[Slurm] Power consumption (55 s):  0.490 kW  0.007 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (57 s):  0.460 kW  0.007 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx942 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx942
index 3f909c97a7..ecb0139234 100644
--- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx942
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.adastra_gfx942
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     17-03-2026 -- 19:11:30
-OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 14:46:22
+OS:       a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD Instinct MI300A Accelerator
 Total number of threads:192
 GPU model: AMD Instinct MI300A
@@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                52.8219        
+Total time of the start-up:                                                50.9967        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.66151        
+Average time of the resolution of the linear problem per call:             1.80608        
 Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.7971         
+Total time of the time loop:                                               1.7142         
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.199678       
-Standard deviation between time steps:                                     0.00636364     
-Time elapsed in the skipped time steps:                                    0.793102       
+Average time per time step:                                                0.190467       
+Standard deviation between time steps:                                     0.00651408     
+Time elapsed in the skipped time steps:                                    0.781128       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0579626       | 29.0        | 2              
-Convection operator                      | 0.01474113      |  7.4        | 4              
-Diffusion operator                       | 0.04030403      | 20.2        | 26             
-Gradient operator                        | 0.01876236      |  9.4        | 4              
-Divergence operator                      | 0.005992479     |  3.0        | 3              
-Source terms                             | 0.01091527      |  5.5        | 2              
-Update ::mettre_a_jour                   | 0.00445164      |  2.2        | 1              
-Solver for implicit diffusion            | 0.02312014      | 11.6        | 4              
-Computation of the time step dt          | 0.01067176      |  5.3        | 8              
-Turbulence model::update                 | 0.00141954      |  0.7        | 1              
-Post-treatment operations                | 0.005924567     |  3.0        | 1              
-Other operations                         | 0.005412221     |  2.7        | 
+Linear solver resolutions Ax=B           | 0.0581491       | 30.5        | 2              
+Convection operator                      | 0.01218876      |  6.4        | 4              
+Diffusion operator                       | 0.03752443      | 19.7        | 26             
+Gradient operator                        | 0.01810211      |  9.5        | 4              
+Divergence operator                      | 0.004492509     |  2.4        | 3              
+Source terms                             | 0.01069241      |  5.6        | 2              
+Update ::mettre_a_jour                   | 0.003962929     |  2.1        | 1              
+Solver for implicit diffusion            | 0.02360572      | 12.4        | 4              
+Computation of the time step dt          | 0.008076957     |  4.2        | 8              
+Turbulence model::update                 | 0.001345484     |  0.7        | 1              
+Post-treatment operations                | 0.006274298     |  3.3        | 1              
+Other operations                         | 0.006052508     |  3.2        | 
 
-Average number of iteration of the linear solver per call:                 33             
+Average number of iteration of the linear solver per call:                 36             
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 33
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0577297       | 28.9        | 2               | 
-Kernels:                                 | 0.134693        | 67.5        | 912             | 
-Copy host to device:                     | 0.00041607      |  0.2        | 18              | 8.6 GB/s
-Copy device to host:                     | 0.000501596     |  0.3        | 7               | 21.0 GB/s
-Alloc/Free on device:                    | 0.000339519     |  0.2        | 0               | 
-GPU: 96% Copy H<->D: 0.46% Alloc/free: 0.17% Comm: 0% CPU & I/O: 3%
+Libraries:                               | 0.0579074       | 30.4        | 2               | 
+Kernels:                                 | 0.12491         | 65.6        | 910             | 
+Copy host to device:                     | 0.000426176     |  0.2        | 18              | 8.4 GB/s
+Copy device to host:                     | 0.000547077     |  0.3        | 7               | 19.3 GB/s
+Alloc/Free on device:                    | 0.000345408     |  0.2        | 0               | 
+GPU: 96% Copy H<->D: 0.51% Alloc/free: 0.18% Comm: 0% CPU & I/O: 3.3%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.170727       
+Time of the post-resolution:                                               0.160747       
 
-Total time for the whole computation                                       55.5828        
+Total time for the whole computation                                       53.6528        
 
-[Slurm] Power consumption (76 s):  0.557 kW  0.012 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (63 s):  0.647 kW  0.011 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.dalianvl_cc100 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..a4325074e6
--- /dev/null
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the JEL_bous_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:17:26
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2332800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                27.6976        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.22819        
+Average number of iteration of the linear solver per call:                 19.5           
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.08387        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.12043        
+Standard deviation between time steps:                                     0.0031966      
+Time elapsed in the skipped time steps:                                    0.371688       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0523012       | 43.4        | 2              
+Convection operator                      | 0.008088967     |  6.7        | 4              
+Diffusion operator                       | 0.02270846      | 18.9        | 26             
+Gradient operator                        | 0.004848305     |  4.0        | 4              
+Divergence operator                      | 0.001220368     |  1.0        | 3              
+Source terms                             | 0.003061725     |  2.5        | 2              
+Update ::mettre_a_jour                   | 0.00164626      |  1.4        | 1              
+Solver for implicit diffusion            | 0.01236225      | 10.3        | 4              
+Computation of the time step dt          | 0.002862308     |  2.4        | 8              
+Turbulence model::update                 | 0.0006005719    |  0.5        | 1              
+Post-treatment operations                | 0.005131665     |  4.3        | 1              
+Other operations                         | 0.005598225     |  4.6        | 
+
+Average number of iteration of the linear solver per call:                 33             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0520444       | 43.2        | 2               | 
+Kernels:                                 | 0.0606194       | 50.3        | 910             | 
+Copy host to device:                     | 0.000342986     |  0.3        | 18              | 10.4 GB/s
+Copy device to host:                     | 0.000255232     |  0.2        | 7               | 41.3 GB/s
+Alloc/Free on device:                    | 0.000147381     |  0.1        | 0               | 
+GPU: 94% Copy H<->D: 0.5% Alloc/free: 0.12% Comm: 0% CPU & I/O: 5.8%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.130801       
+
+Total time for the whole computation                                       29.284         
+
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.eureka_cc89 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..d0c1e60cfb
--- /dev/null
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.eureka_cc89
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the JEL_bous_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:34:44
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2332800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                37.701         
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.65191        
+Average number of iteration of the linear solver per call:                 19.5           
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.346          
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.260667       
+Standard deviation between time steps:                                     0.00717772     
+Time elapsed in the skipped time steps:                                    1.30342        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.100194        | 38.4        | 2              
+Convection operator                      | 0.01751202      |  6.7        | 4              
+Diffusion operator                       | 0.04631485      | 17.8        | 26             
+Gradient operator                        | 0.01122161      |  4.3        | 4              
+Divergence operator                      | 0.003520645     |  1.4        | 3              
+Source terms                             | 0.01072508      |  4.1        | 2              
+Update ::mettre_a_jour                   | 0.004113718     |  1.6        | 1              
+Solver for implicit diffusion            | 0.03897518      | 15.0        | 4              
+Computation of the time step dt          | 0.007720912     |  3.0        | 8              
+Turbulence model::update                 | 0.001730033     |  0.7        | 1              
+Post-treatment operations                | 0.005838174     |  2.2        | 1              
+Other operations                         | 0.01280066      |  4.9        | 
+
+Average number of iteration of the linear solver per call:                 33             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0999176       | 38.3        | 2               | 
+Kernels:                                 | 0.153016        | 58.7        | 910             | 
+Copy host to device:                     | 0.000531594     |  0.2        | 18              | 6.7 GB/s
+Copy device to host:                     | 0.00163624      |  0.6        | 7               | 6.4 GB/s
+Alloc/Free on device:                    | 0.000130279     |  0.0        | 0               | 
+GPU: 97% Copy H<->D: 0.83% Alloc/free: 0.05% Comm: 0% CPU & I/O: 2.1%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0708352      
+
+Total time for the whole computation                                       41.4213        
+
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.irene-amd-ccrt_cc70
index 1979b6ebb7..e757b8e9f3 100644
--- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.irene-amd-ccrt_cc70
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.irene-amd-ccrt_cc70
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 14:19:16
-OS:       irene7053__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     23-04-2026 -- 14:55:37
+OS:       irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
 Total number of threads:80
 GPU model: Tesla V100-SXM2-16GB
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                52.8484        
+Total time of the start-up:                                                52.0313        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             2.81103        
+Average time of the resolution of the linear problem per call:             2.46353        
 Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               5.40516        
+Total time of the time loop:                                               3.91246        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.600573       
-Standard deviation between time steps:                                     0.016464       
-Time elapsed in the skipped time steps:                                    2.03739        
+Average time per time step:                                                0.434718       
+Standard deviation between time steps:                                     0.0117024      
+Time elapsed in the skipped time steps:                                    1.83216        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.178813        | 29.8        | 2              
-Convection operator                      | 0.04298263      |  7.2        | 4              
-Diffusion operator                       | 0.1637061       | 27.3        | 26             
-Gradient operator                        | 0.03590419      |  6.0        | 4              
-Divergence operator                      | 0.02346987      |  3.9        | 3              
-Source terms                             | 0.03134122      |  5.2        | 2              
-Update ::mettre_a_jour                   | 0.01515883      |  2.5        | 1              
-Solver for implicit diffusion            | 0.04919646      |  8.2        | 4              
-Computation of the time step dt          | 0.03779348      |  6.3        | 8              
-Turbulence model::update                 | 0.004684588     |  0.8        | 1              
-Post-treatment operations                | 0.008450638     |  1.4        | 1              
-Other operations                         | 0.009071418     |  1.5        | 
+Linear solver resolutions Ax=B           | 0.151745        | 34.9        | 2              
+Convection operator                      | 0.02512061      |  5.8        | 4              
+Diffusion operator                       | 0.09764007      | 22.5        | 26             
+Gradient operator                        | 0.01804826      |  4.2        | 4              
+Divergence operator                      | 0.01293974      |  3.0        | 3              
+Source terms                             | 0.02053407      |  4.7        | 2              
+Update ::mettre_a_jour                   | 0.01018209      |  2.3        | 1              
+Solver for implicit diffusion            | 0.04906223      | 11.3        | 4              
+Computation of the time step dt          | 0.02539198      |  5.8        | 8              
+Turbulence model::update                 | 0.00363699      |  0.8        | 1              
+Post-treatment operations                | 0.008557846     |  2.0        | 1              
+Other operations                         | 0.01185921      |  2.7        | 
 
 Average number of iteration of the linear solver per call:                 33             
 
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 33
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.17824         | 29.7        | 2               | 
-Kernels:                                 | 0.410514        | 68.4        | 912             | 
-Copy host to device:                     | 0.00116731      |  0.2        | 18              | 3.1 GB/s
-Copy device to host:                     | 0.00258639      |  0.4        | 7               | 4.1 GB/s
-Alloc/Free on device:                    | 0.000100786     |  0.0        | 0               | 
-GPU: 98% Copy H<->D: 0.63% Alloc/free: 0.017% Comm: 0% CPU & I/O: 1.3%
+Libraries:                               | 0.15118         | 34.8        | 2               | 
+Kernels:                                 | 0.271847        | 62.5        | 910             | 
+Copy host to device:                     | 0.00114374      |  0.3        | 18              | 3.1 GB/s
+Copy device to host:                     | 0.00269277      |  0.6        | 7               | 3.9 GB/s
+Alloc/Free on device:                    | 0.000103352     |  0.0        | 0               | 
+GPU: 97% Copy H<->D: 0.88% Alloc/free: 0.024% Comm: 0% CPU & I/O: 1.8%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.168883       
+Time of the post-resolution:                                               0.165987       
 
-Total time for the whole computation                                       60.4598        
+Total time for the whole computation                                       57.9419        
 
-[Slurm] Power consumption (83 s):  0.208 kW  0.005 kWh  0.000 € (0.10€/kWh)
+[Slurm] Power consumption (70 s):  0.168 kW  0.003 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is157091_cc86 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is157091_cc86
index 1748ac8337..b5dfaed898 100644
--- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is157091_cc86
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is157091_cc86
@@ -8,7 +8,7 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     11-03-2026 -- 20:22:08
+Date:     23-04-2026 -- 11:36:24
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                32.5388        
+Total time of the start-up:                                                33.1677        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.27684        
+Average time of the resolution of the linear problem per call:             1.52807        
 Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               5.07593        
+Total time of the time loop:                                               4.11182        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.563992       
-Standard deviation between time steps:                                     0.0136432      
-Time elapsed in the skipped time steps:                                    1.26174        
+Average time per time step:                                                0.456869       
+Standard deviation between time steps:                                     0.0119697      
+Time elapsed in the skipped time steps:                                    1.21145        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.186567        | 33.1        | 2              
-Convection operator                      | 0.04115435      |  7.3        | 4              
-Diffusion operator                       | 0.118614        | 21.0        | 26             
-Gradient operator                        | 0.03055508      |  5.4        | 4              
-Divergence operator                      | 0.01801101      |  3.2        | 3              
-Source terms                             | 0.03885931      |  6.9        | 2              
-Update ::mettre_a_jour                   | 0.01322145      |  2.3        | 1              
-Solver for implicit diffusion            | 0.05424782      |  9.6        | 4              
-Computation of the time step dt          | 0.0408768       |  7.2        | 8              
-Turbulence model::update                 | 0.005068876     |  0.9        | 1              
-Post-treatment operations                | 0.005201474     |  0.9        | 1              
-Other operations                         | 0.0116147       |  2.1        | 
+Linear solver resolutions Ax=B           | 0.160973        | 35.2        | 2              
+Convection operator                      | 0.03524869      |  7.7        | 4              
+Diffusion operator                       | 0.08888196      | 19.5        | 26             
+Gradient operator                        | 0.01371358      |  3.0        | 4              
+Divergence operator                      | 0.0114062       |  2.5        | 3              
+Source terms                             | 0.03076204      |  6.7        | 2              
+Update ::mettre_a_jour                   | 0.01001952      |  2.2        | 1              
+Solver for implicit diffusion            | 0.05382268      | 11.8        | 4              
+Computation of the time step dt          | 0.02984461      |  6.5        | 8              
+Turbulence model::update                 | 0.004468384     |  1.0        | 1              
+Post-treatment operations                | 0.005182095     |  1.1        | 1              
+Other operations                         | 0.01254604      |  2.7        | 
 
 Average number of iteration of the linear solver per call:                 33             
 
@@ -63,16 +63,16 @@ Average number of iteration of the linear solver per call:                 33
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.185407        | 32.9        | 2               | 
-Kernels:                                 | 0.371386        | 65.8        | 912             | 
-Copy host to device:                     | 0.000484581     |  0.1        | 18              | 7.4 GB/s
-Copy device to host:                     | 0.00107678      |  0.2        | 7               | 9.8 GB/s
-Alloc/Free on device:                    | 0.000149894     |  0.0        | 0               | 
-GPU: 99% Copy H<->D: 0.28% Alloc/free: 0.027% Comm: 0% CPU & I/O: 0.97%
+Libraries:                               | 0.160438        | 35.1        | 2               | 
+Kernels:                                 | 0.289686        | 63.4        | 910             | 
+Copy host to device:                     | 0.000593965     |  0.1        | 18              | 6.0 GB/s
+Copy device to host:                     | 0.00107771      |  0.2        | 7               | 9.8 GB/s
+Alloc/Free on device:                    | 0.000141054     |  0.0        | 0               | 
+GPU: 99% Copy H<->D: 0.37% Alloc/free: 0.031% Comm: 0% CPU & I/O: 1.1%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.265853       
+Time of the post-resolution:                                               0.0692136      
 
-Total time for the whole computation                                       39.1423        
+Total time for the whole computation                                       38.5602        
 
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is159479_cc120 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..ceb6b4f984
--- /dev/null
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is159479_cc120
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the JEL_bous_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:36:31
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2332800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                24.8498        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.909361       
+Average number of iteration of the linear solver per call:                 19.5           
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.46471        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.162746       
+Standard deviation between time steps:                                     0.00530569     
+Time elapsed in the skipped time steps:                                    0.776553       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0690585       | 42.4        | 2              
+Convection operator                      | 0.01185419      |  7.3        | 4              
+Diffusion operator                       | 0.02884948      | 17.7        | 26             
+Gradient operator                        | 0.006323226     |  3.9        | 4              
+Divergence operator                      | 0.001823318     |  1.1        | 3              
+Source terms                             | 0.006613129     |  4.1        | 2              
+Update ::mettre_a_jour                   | 0.002277541     |  1.4        | 1              
+Solver for implicit diffusion            | 0.0200064       | 12.3        | 4              
+Computation of the time step dt          | 0.004297829     |  2.6        | 8              
+Turbulence model::update                 | 0.001027548     |  0.6        | 1              
+Post-treatment operations                | 0.003883966     |  2.4        | 1              
+Other operations                         | 0.006730717     |  4.1        | 
+
+Average number of iteration of the linear solver per call:                 33             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0682865       | 42.0        | 2               | 
+Kernels:                                 | 0.0889239       | 54.6        | 910             | 
+Copy host to device:                     | 0.000346185     |  0.2        | 18              | 10.3 GB/s
+Copy device to host:                     | 0.00149129      |  0.9        | 7               | 7.1 GB/s
+Alloc/Free on device:                    | 4.52976e-05     |  0.0        | 0               | 
+GPU: 97% Copy H<->D: 1.1% Alloc/free: 0.028% Comm: 0% CPU & I/O: 2.2%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0580743      
+
+Total time for the whole computation                                       27.1491        
+
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is247793_gfx1100 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is247793_gfx1100
index 6655875c6c..7713af45da 100644
--- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is247793_gfx1100
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.is247793_gfx1100
@@ -1,53 +1,78 @@
-Statistiques d'initialisation du calcul
-
-Temps total                       45.5252
-
-Statistiques de resolution du probleme
-
-Temps total                       8.20278
-
-
-Timesteps                         10
-Secondes / pas de temps           0.820271
-Dont solveurs Ax=B                0.209617 25% (2 appels/pas de temps)
-Dont solveur diffusion_implicite  0.087671 10% (4 appels/pas de temps)
-Dont mettre_a_jour                0.018403  2% (1 appel/pas de temps)
-Dont operateurs convection        0.113021 13% (4 appels/pas de temps)
-Dont operateurs diffusion         0.218056 26% (26 appels/pas de temps)
-Dont operateurs gradient          0.039941  4% (4 appels/pas de temps)
-Dont operateurs divergence        0.012809  1% (3 appels/pas de temps)
-Dont operateurs source            0.052145  6% (2 appels/pas de temps)
-Dont operations postraitement     0.017725  2% (1 appel/pas de temps)
-Dont calcul dt                    0.022425  2% (8 appels/pas de temps)
-Dont modele turbulence            0.006923  0% (1 appel/pas de temps)
-Dont calcul divers                0.021534  2% (0 appels/pas de temps)
-Nb solveur / pas de temps         2
-Secondes / solveur                0.104808
-Iterations / solveur              27.5
-GPU statistics per time step (experimental):
-Libraries : 0.209143 s 25.5%  2.0 calls
-Kernels   : 0.547938 s 66.8% 895.4 calls
-Copy H2D  : 0.025278 s  3.1% 59.6 calls 11.2 GB/s
-Copy D2H  : 0.013749 s  1.7% 80.7 calls 12.7 GB/s
-Alloc/Free: 0.001174 s  0.1%  8.0 calls
-GPU: 92.2% Copy H<->D: 4.7% Alloc/Free: 0.1% Comm: 0% CPU & Others: 2.8%
-I/O:
-
-Timesteps = number of time steps
-Nb solveur = number of linear system resolutions
-Nb assemblage implicite = number of matrix assemblies for the implicit scheme
-Iterations = average number of iterations of the solver
-Communications = fraction of the time spent
-                 in communications between processors (excluding io files)
-Network latency = time of one mpsum measured by an internal bench over 0.1s
-Network bandwidth = maximum on all processors
-                    of the average bandwidth of send_recv operations
-Waiting time = estimation of the waiting time of the different processors
-
-Max_waiting_time big    => probably due to a bad partitioning
-Communications > 30%    => too many processors or network too slow
-
-Statistiques de post resolution
-
-Temps total                       0.119974
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the JEL_bous_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 18:28:55
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2332800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                29.6466        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             2.0431         
+Average number of iteration of the linear solver per call:                 19.5           
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               4.40577        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.48953        
+Standard deviation between time steps:                                     0.011864       
+Time elapsed in the skipped time steps:                                    1.64208        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.190709        | 39.0        | 2              
+Convection operator                      | 0.0311751       |  6.4        | 4              
+Diffusion operator                       | 0.08601413      | 17.6        | 26             
+Gradient operator                        | 0.02478783      |  5.1        | 4              
+Divergence operator                      | 0.007880499     |  1.6        | 3              
+Source terms                             | 0.03365951      |  6.9        | 2              
+Update ::mettre_a_jour                   | 0.007327208     |  1.5        | 1              
+Solver for implicit diffusion            | 0.06193504      | 12.7        | 4              
+Computation of the time step dt          | 0.01854743      |  3.8        | 8              
+Turbulence model::update                 | 0.00304363      |  0.6        | 1              
+Post-treatment operations                | 0.004884117     |  1.0        | 1              
+Other operations                         | 0.01956652      |  4.0        | 
+
+Average number of iteration of the linear solver per call:                 36             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.190255        | 38.9        | 2               | 
+Kernels:                                 | 0.293721        | 60.0        | 910             | 
+Copy host to device:                     | 0.000463657     |  0.1        | 18              | 7.7 GB/s
+Copy device to host:                     | 0.00109986      |  0.2        | 7               | 9.6 GB/s
+Alloc/Free on device:                    | 0.000104513     |  0.0        | 0               | 
+GPU: 99% Copy H<->D: 0.32% Alloc/free: 0.021% Comm: 0% CPU & I/O: 0.79%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.138724       
+
+Total time for the whole computation                                       35.8332        
 
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.jean-zay_cc90 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.jean-zay_cc90
index 5710fadd8e..8bc7ec847b 100644
--- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.jean-zay_cc90
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.jean-zay_cc90
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 14:04:54
-OS:       jzxh082__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+Date:     23-04-2026 -- 08:17:58
+OS:       jzxh011__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
 CPU model : Intel(R) Xeon(R) Platinum 8468
 Total number of threads:192
 GPU model: NVIDIA H100 80GB HBM3
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                34.1096        
+Total time of the start-up:                                                39.7913        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.50123        
+Average time of the resolution of the linear problem per call:             2.18749        
 Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.58632        
+Total time of the time loop:                                               1.42528        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.176258       
-Standard deviation between time steps:                                     0.0118648      
-Time elapsed in the skipped time steps:                                    1.20137        
+Average time per time step:                                                0.158365       
+Standard deviation between time steps:                                     0.00459821     
+Time elapsed in the skipped time steps:                                    1.00394        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.066867        | 37.9        | 2              
-Convection operator                      | 0.01270288      |  7.2        | 4              
-Diffusion operator                       | 0.03147784      | 17.9        | 26             
-Gradient operator                        | 0.009996821     |  5.7        | 4              
-Divergence operator                      | 0.004424433     |  2.5        | 3              
-Source terms                             | 0.006921498     |  3.9        | 2              
-Update ::mettre_a_jour                   | 0.003625511     |  2.1        | 1              
-Solver for implicit diffusion            | 0.0164828       |  9.4        | 4              
-Computation of the time step dt          | 0.0054422       |  3.1        | 8              
-Turbulence model::update                 | 0.001022485     |  0.6        | 1              
-Post-treatment operations                | 0.01127463      |  6.4        | 1              
-Other operations                         | 0.006019646     |  3.4        | 
+Linear solver resolutions Ax=B           | 0.0654879       | 41.4        | 2              
+Convection operator                      | 0.009842782     |  6.2        | 4              
+Diffusion operator                       | 0.02657007      | 16.8        | 26             
+Gradient operator                        | 0.00834769      |  5.3        | 4              
+Divergence operator                      | 0.002102096     |  1.3        | 3              
+Source terms                             | 0.006326014     |  4.0        | 2              
+Update ::mettre_a_jour                   | 0.002668702     |  1.7        | 1              
+Solver for implicit diffusion            | 0.0166026       | 10.5        | 4              
+Computation of the time step dt          | 0.004435701     |  2.8        | 8              
+Turbulence model::update                 | 0.0009156802    |  0.6        | 1              
+Post-treatment operations                | 0.008085213     |  5.1        | 1              
+Other operations                         | 0.00698039      |  4.4        | 
 
 Average number of iteration of the linear solver per call:                 33             
 
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 33
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0666176       | 37.8        | 2               | 
-Kernels:                                 | 0.0952303       | 54.0        | 912             | 
-Copy host to device:                     | 0.00059641      |  0.3        | 18              | 6.0 GB/s
-Copy device to host:                     | 0.00112574      |  0.6        | 7               | 9.4 GB/s
-Alloc/Free on device:                    | 6.89258e-05     |  0.0        | 0               | 
-GPU: 92% Copy H<->D: 0.98% Alloc/free: 0.039% Comm: 0% CPU & I/O: 7.2%
+Libraries:                               | 0.0652517       | 41.2        | 2               | 
+Kernels:                                 | 0.0818582       | 51.7        | 910             | 
+Copy host to device:                     | 0.000631026     |  0.4        | 18              | 5.7 GB/s
+Copy device to host:                     | 0.00116906      |  0.7        | 7               | 9.0 GB/s
+Alloc/Free on device:                    | 6.88022e-05     |  0.0        | 0               | 
+GPU: 93% Copy H<->D: 1.1% Alloc/free: 0.043% Comm: 0% CPU & I/O: 5.9%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.132989       
+Time of the post-resolution:                                               0.0998874      
 
-Total time for the whole computation                                       37.0303        
+Total time for the whole computation                                       42.3205        
 
-[Slurm] Power consumption (62 s):  0.367 kW  0.006 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (50 s):  0.425 kW  0.006 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.lumi_gfx90a b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.lumi_gfx90a
index 518c051131..038c9a11fd 100644
--- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.lumi_gfx90a
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.lumi_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     24-02-2026 -- 00:05:59
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     15-05-2026 -- 20:30:58
+OS:       nid005005__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,40 +22,40 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                79.3918        
+Total time of the start-up:                                                87.1572        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             4.23093        
+Average time of the resolution of the linear problem per call:             3.02811        
 Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               5.85712        
+Total time of the time loop:                                               4.06447        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.650791       
-Standard deviation between time steps:                                     0.0144864      
-Time elapsed in the skipped time steps:                                    1.53947        
+Average time per time step:                                                0.451608       
+Standard deviation between time steps:                                     0.00999368     
+Time elapsed in the skipped time steps:                                    1.32075        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.124112        | 15.1        | 2              
-Convection operator                      | 0.05576208      |  6.8        | 4              
-Diffusion operator                       | 0.1568919       | 19.1        | 26             
-Gradient operator                        | 0.08943955      | 10.9        | 4              
-Divergence operator                      | 0.02169871      |  2.6        | 3              
-Source terms                             | 0.09198948      | 11.2        | 2              
-Update ::mettre_a_jour                   | 0.01530994      |  1.9        | 1              
-Solver for implicit diffusion            | 0.03785859      |  4.6        | 4              
-Computation of the time step dt          | 0.03651262      |  4.4        | 8              
-Turbulence model::update                 | 0.005892088     |  0.7        | 1              
-Post-treatment operations                | 0.01152028      |  1.4        | 1              
-Other operations                         | 0.003804343     |  0.5        | 
+Linear solver resolutions Ax=B           | 0.108979        | 24.1        | 2              
+Convection operator                      | 0.03298987      |  7.3        | 4              
+Diffusion operator                       | 0.09218553      | 20.4        | 26             
+Gradient operator                        | 0.03981864      |  8.8        | 4              
+Divergence operator                      | 0.01219288      |  2.7        | 3              
+Source terms                             | 0.07573489      | 16.8        | 2              
+Update ::mettre_a_jour                   | 0.01071812      |  2.4        | 1              
+Solver for implicit diffusion            | 0.03443697      |  7.6        | 4              
+Computation of the time step dt          | 0.02651764      |  5.9        | 8              
+Turbulence model::update                 | 0.004947016     |  1.1        | 1              
+Post-treatment operations                | 0.006235741     |  1.4        | 1              
+Other operations                         | 0.006851491     |  1.5        | 
 
-Average number of iteration of the linear solver per call:                 30             
+Average number of iteration of the linear solver per call:                 36             
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 30
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.122748        | 18.9        | 2               | 
-Kernels:                                 | 0.5152          | 79.2        | 912             | 
-Copy host to device:                     | 0.000510664     |  0.1        | 18              | 7.0 GB/s
-Copy device to host:                     | 0.000761906     |  0.1        | 7               | 13.8 GB/s
-Alloc/Free on device:                    | 2.60104e-05     |  0.0        | 0               | 
-GPU: 98% Copy H<->D: 0.2% Alloc/free: 0.004% Comm: 0% CPU & I/O: 1.8%
+Libraries:                               | 0.108587        | 24.0        | 2               | 
+Kernels:                                 | 0.335198        | 74.2        | 910             | 
+Copy host to device:                     | 0.000510021     |  0.1        | 18              | 7.0 GB/s
+Copy device to host:                     | 0.000747086     |  0.2        | 7               | 14.1 GB/s
+Alloc/Free on device:                    | 2.6155e-05      |  0.0        | 0               | 
+GPU: 98% Copy H<->D: 0.28% Alloc/free: 0.0058% Comm: 0% CPU & I/O: 1.4%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.169877       
+Time of the post-resolution:                                               0.170278       
 
-Total time for the whole computation                                       86.9583        
+Total time for the whole computation                                       92.7127        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (121 s):  0.490 kW  0.016 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.topaze_cc80 b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.topaze_cc80
index e197f0213a..379c065968 100644
--- a/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.topaze_cc80
+++ b/tests/GPU/JEL_bous/JEL_bous_BENCH.TU.topaze_cc80
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 17:49:16
-OS:       topaze7050__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     15-05-2026 -- 13:37:16
+OS:       topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                41.3779        
+Total time of the start-up:                                                41.8203        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             2.0777         
+Average time of the resolution of the linear problem per call:             1.83419        
 Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               2.51937        
+Total time of the time loop:                                               2.04769        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.27993        
-Standard deviation between time steps:                                     0.00755805     
-Time elapsed in the skipped time steps:                                    1.24275        
+Average time per time step:                                                0.227521       
+Standard deviation between time steps:                                     0.00633132     
+Time elapsed in the skipped time steps:                                    1.15739        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.095653        | 22.9        | 2              
-Convection operator                      | 0.02602373      |  6.2        | 4              
-Diffusion operator                       | 0.06149648      | 14.7        | 26             
-Gradient operator                        | 0.01634973      |  3.9        | 4              
-Divergence operator                      | 0.005980824     |  1.4        | 3              
-Source terms                             | 0.0107638       |  2.6        | 2              
-Update ::mettre_a_jour                   | 0.004935778     |  1.2        | 1              
-Solver for implicit diffusion            | 0.02931744      |  7.0        | 4              
-Computation of the time step dt          | 0.01242498      |  3.0        | 8              
-Turbulence model::update                 | 0.001668194     |  0.4        | 1              
-Post-treatment operations                | 0.006153714     |  1.5        | 1              
-Other operations                         | 0.009161827     |  2.2        | 
+Linear solver resolutions Ax=B           | 0.0931296       | 40.9        | 2              
+Convection operator                      | 0.01537725      |  6.8        | 4              
+Diffusion operator                       | 0.04146383      | 18.2        | 26             
+Gradient operator                        | 0.01223332      |  5.4        | 4              
+Divergence operator                      | 0.002743943     |  1.2        | 3              
+Source terms                             | 0.009263052     |  4.1        | 2              
+Update ::mettre_a_jour                   | 0.003475224     |  1.5        | 1              
+Solver for implicit diffusion            | 0.0260186       | 11.4        | 4              
+Computation of the time step dt          | 0.007452805     |  3.3        | 8              
+Turbulence model::update                 | 0.001451508     |  0.6        | 1              
+Post-treatment operations                | 0.006422715     |  2.8        | 1              
+Other operations                         | 0.008489126     |  3.7        | 
 
 Average number of iteration of the linear solver per call:                 33             
 
@@ -63,16 +63,17 @@ Average number of iteration of the linear solver per call:                 33
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0953374       | 34.1        | 2               | 
-Kernels:                                 | 0.176263        | 63.0        | 912             | 
-Copy host to device:                     | 0.000504537     |  0.2        | 18              | 7.1 GB/s
-Copy device to host:                     | 0.000921495     |  0.3        | 7               | 11.5 GB/s
-Alloc/Free on device:                    | 9.07312e-05     |  0.0        | 0               | 
-GPU: 97% Copy H<->D: 0.51% Alloc/free: 0.032% Comm: 0% CPU & I/O: 2.4%
+Libraries:                               | 0.0928011       | 40.8        | 2               | 
+Kernels:                                 | 0.126048        | 55.4        | 910             | 
+Copy host to device:                     | 0.000506734     |  0.2        | 18              | 7.1 GB/s
+Copy device to host:                     | 0.000953698     |  0.4        | 7               | 11.1 GB/s
+Alloc/Free on device:                    | 0.00010037      |  0.0        | 0               | 
+GPU: 96% Copy H<->D: 0.64% Alloc/free: 0.044% Comm: 0% CPU & I/O: 3.1%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.193253       
+Time of the post-resolution:                                               0.184978       
 
-Total time for the whole computation                                       45.3332        
+Total time for the whole computation                                       45.2103        
 
+[Slurm] Power consumption (80 s):  0.400 kW  0.009 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx90ax8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx90ax8
index dbf1d79796..d6ecc5ce4c 100644
--- a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx90ax8
+++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx90ax8
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 18:07:54
-OS:       g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     08-06-2026 -- 16:05:31
+OS:       g1331__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,86 +22,86 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                11.3269        
-Number of virtual exchanges:                                               88             
-Maximum number of MPI allreduce per time step                              230            
+Total time of the start-up:                                                17.1944        
+Number of virtual exchanges:                                               91             
+Maximum number of MPI allreduce per time step                              234            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.544609       
+Average time of the resolution of the linear problem per call:             1.73554        
 Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.7114         
+Total time of the time loop:                                               1.3246         
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.190155       
-Standard deviation between time steps:                                     0.00412601     
-Time elapsed in the skipped time steps:                                    0.300029       
+Average time per time step:                                                0.147178       
+Standard deviation between time steps:                                     0.0035771      
+Time elapsed in the skipped time steps:                                    0.332575       
 
-Percent of total time spend in communication:                              6.97401        
+Percent of total time spend in communication:                              8.20387        
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.097368        | 43.6        | 2              
-Convection operator                      | 0.008809825     |  3.9        | 4              
-Diffusion operator                       | 0.0271289       | 12.1        | 26             
-Gradient operator                        | 0.01032468      |  4.6        | 4              
-Divergence operator                      | 0.003033461     |  1.4        | 3              
-Source terms                             | 0.009662045     |  4.3        | 2              
-Update ::mettre_a_jour                   | 0.003470453     |  1.6        | 1              
-Solver for implicit diffusion            | 0.01615869      |  7.2        | 4              
-Computation of the time step dt          | 0.00510907      |  2.3        | 8              
-Turbulence model::update                 | 0.001198047     |  0.5        | 1              
-Post-treatment operations                | 0.002356216     |  1.1        | 1              
-Other operations                         | 0.005535735     |  2.5        | 
-Number of virtual exchanges per time step:                                 80             
+Linear solver resolutions Ax=B           | 0.0605146       | 41.1        | 2              
+Convection operator                      | 0.005624402     |  3.8        | 4              
+Diffusion operator                       | 0.02587681      | 17.6        | 26             
+Gradient operator                        | 0.006207187     |  4.2        | 4              
+Divergence operator                      | 0.002373149     |  1.6        | 3              
+Source terms                             | 0.009926151     |  6.7        | 2              
+Update ::mettre_a_jour                   | 0.003552744     |  2.4        | 1              
+Solver for implicit diffusion            | 0.01746022      | 11.9        | 4              
+Computation of the time step dt          | 0.004997502     |  3.4        | 8              
+Turbulence model::update                 | 0.00122556      |  0.8        | 1              
+Post-treatment operations                | 0.002465866     |  1.7        | 1              
+Other operations                         | 0.006953813     |  4.7        | 
+Number of virtual exchanges per time step:                                 76             
 Maximum number of MPI allreduce per time step                              66.7           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Average number of iteration of the linear solver per call:                 33             
+Average number of iteration of the linear solver per call:                 30             
 
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                Time loop statistics: IO
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Output write sequential:                                                   895            MB/s
+Output write sequential:                                                   886            MB/s
 
 ---------------------------------------------------------------------------------------------------------
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      4.7            %
-Max of the fraction of the time spent in communications between processors:          8.4            %
-Min of the fraction of the time spent in communications between processors:          4              %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         3.39618e-06    
-Network maximum bandwidth on all processors:                                         47.1 GB/s      
+Average of the fraction of the time spent in communications between processors:      6.5            %
+Max of the fraction of the time spent in communications between processors:          11.6           %
+Min of the fraction of the time spent in communications between processors:          5.3            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         3.28297e-06    
+Network maximum bandwidth on all processors:                                         46.3 GB/s      
 Total network traffic:                                                               1136.61        MB/time step
 Average message size:                                                                473.762        kB
-Min waiting time:                                                                    4.2            % of total time
-Max waiting time:                                                                    8              % of total time
-Avg waiting time:                                                                    5.4375         % of total time
+Min waiting time:                                                                    5.2            % of total time
+Max waiting time:                                                                    10.1           % of total time
+Avg waiting time:                                                                    8.075          % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.096971        | 51.0        | 2               | 
-Kernels:                                 | 0.0718802       | 37.8        | 1084            | 
-Copy host to device:                     | 0.000331019     |  0.2        | 12              | 3.9 GB/s
-Copy device to host:                     | 0.000457719     |  0.2        | 10              | 4.9 GB/s
-Alloc/Free on device:                    | 0.000156874     |  0.1        | 60              | 
-GPU: 89% Copy H<->D: 0.41% Alloc/free: 0.082% Comm: 8.2% CPU & I/O: 2.5%
+Libraries:                               | 0.0600684       | 40.8        | 2               | 
+Kernels:                                 | 0.0671133       | 45.6        | 1086            | 
+Copy host to device:                     | 0.000222822     |  0.2        | 9               | 2.4 GB/s
+Copy device to host:                     | 0.000324692     |  0.2        | 7               | 4.6 GB/s
+Alloc/Free on device:                    | 2.89998e-05     |  0.0        | 60              | 
+GPU: 86% Copy H<->D: 0.37% Alloc/free: 0.02% Comm: 10% CPU & I/O: 2.9%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0357117      
+Time of the post-resolution:                                               0.0364495      
 Maximum number of MPI allreduce per time step                              7              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -109,6 +109,6 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       13.374         
+Total time for the whole computation                                       18.8881        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (31 s):  0.524 kW  0.005 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx942x8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx942x8
new file mode 100644
index 0000000000..4de8236f49
--- /dev/null
+++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.adastra_gfx942x8
@@ -0,0 +1,114 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PAR_JEL_bous_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     08-06-2026 -- 14:44:57
+OS:       a1001__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD Instinct MI300A Accelerator
+Total number of threads:192
+GPU model: AMD Instinct MI300A
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 8
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2332800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                11.8595        
+Number of virtual exchanges:                                               91             
+Maximum number of MPI allreduce per time step                              234            
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.833593       
+Average number of iteration of the linear solver per call:                 19.5           
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.14404        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.127115       
+Standard deviation between time steps:                                     0.00446888     
+Time elapsed in the skipped time steps:                                    0.267194       
+
+Percent of total time spend in communication:                              8.26359        
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0791995       | 62.3        | 2              
+Convection operator                      | 0.002333325     |  1.8        | 4              
+Diffusion operator                       | 0.01340551      | 10.5        | 26             
+Gradient operator                        | 0.002692253     |  2.1        | 4              
+Divergence operator                      | 0.000894222     |  0.7        | 3              
+Source terms                             | 0.001624698     |  1.3        | 2              
+Update ::mettre_a_jour                   | 0.002804274     |  2.2        | 1              
+Solver for implicit diffusion            | 0.01320826      | 10.4        | 4              
+Computation of the time step dt          | 0.00250911      |  2.0        | 8              
+Turbulence model::update                 | 0.000777171     |  0.6        | 1              
+Post-treatment operations                | 0.002293142     |  1.8        | 1              
+Other operations                         | 0.005373586     |  4.2        | 
+Number of virtual exchanges per time step:                                 76             
+Maximum number of MPI allreduce per time step                              66.7           
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Average number of iteration of the linear solver per call:                 30             
+
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                               Time loop statistics: IO
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Output write sequential:                                                   1025           MB/s
+
+---------------------------------------------------------------------------------------------------------
+Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
+---------------------------------------------------------------------------------------------------------
+
+Average of the fraction of the time spent in communications between processors:      7.2            %
+Max of the fraction of the time spent in communications between processors:          11.6           %
+Min of the fraction of the time spent in communications between processors:          5.9            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         1.26385e-05    
+Network maximum bandwidth on all processors:                                         51.3 GB/s      
+Total network traffic:                                                               1136.61        MB/time step
+Average message size:                                                                473.762        kB
+Min waiting time:                                                                    6.3            % of total time
+Max waiting time:                                                                    10.5           % of total time
+Avg waiting time:                                                                    8.1875         % of total time
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0787359       | 61.9        | 2               | 
+Kernels:                                 | 0.0313283       | 24.6        | 1086            | 
+Copy host to device:                     | 0.000201995     |  0.2        | 9               | 2.7 GB/s
+Copy device to host:                     | 0.000282562     |  0.2        | 7               | 5.3 GB/s
+Alloc/Free on device:                    | 7.34502e-05     |  0.1        | 60              | 
+GPU: 87% Copy H<->D: 0.38% Alloc/free: 0.058% Comm: 10% CPU & I/O: 2.8%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0479479      
+Maximum number of MPI allreduce per time step                              7              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Max waiting time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Total time for the whole computation                                       13.3187        
+
+[Slurm] Power consumption (21 s):  0.735 kW  0.004 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.jean-zay_cc90x8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.jean-zay_cc90x8
index 1cdc57caf3..22117dc475 100644
--- a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.jean-zay_cc90x8
+++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.jean-zay_cc90x8
@@ -8,101 +8,100 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     19-11-2025 -- 12:36:54
-OS:       jzxh178__Linux__x86_64__5.14.0-427.76.1.el9_4.x86_64__#1 SMP PREEMPT_DYNAMIC Fri Jun 27 09:53:45 EDT 2025
-CPU:      Model name:  Intel(R) Xeon(R) Platinum 8468 ; Thread(s) per core:  2
-GPU:      | NVIDIA-SMI 570.86.15  Driver Version: 570.86.15  CUDA Version: 12.8  |
-|  0  NVIDIA H100 80GB HBM3  On  |  00000000:
-Nb procs: 8
-TRUST version: 1.9.7_beta
+Date:     10-06-2026 -- 10:42:54
+OS:       jzxh021__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.20
+Nb procs used for the computation: 8
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2332800
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                7.94766        
-Percent of untracked time during computation start-up:                     3.64887e-08    
-Number of virtual exchanges:                                               88             
+Total time of the start-up:                                                8.36044        
+Number of virtual exchanges:                                               91             
 Maximum number of MPI allreduce per time step                              234            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.06429        
-Average number of iteration of the linear solver per call:                 10.5           
+Average time of the resolution of the linear problem per call:             1.48918        
+Average number of iteration of the linear solver per call:                 18             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.28802        
+Total time of the time loop:                                               1.08964        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.143113       
-Standard deviation between time steps:                                     0.00230694     
-Time elapsed in the skipped time steps:                                    0.283043       
+Average time per time step:                                                0.121071       
+Standard deviation between time steps:                                     0.00937755     
+Time elapsed in the skipped time steps:                                    0.389716       
 
-Percent of total time spend in communication:                              8.32178        
+Percent of total time spend in communication:                              3.62109        
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0953142       | 66.6        | 2              
-Convection operator                      | 0.003255428     |  2.3        | 4              
-Diffusion operator                       | 0.01383943      |  9.7        | 26             
-Gradient operator                        | 0.001517576     |  1.1        | 4              
-Divergence operator                      | 0.002025552     |  1.4        | 3              
-Source terms                             | 0.00112365      |  0.8        | 2              
-Update ::mettre_a_jour                   | 0.002446269     |  1.7        | 1              
-Solver for implicit diffusion            | 0.01201407      |  8.4        | 4              
-Computation of the time step dt          | 0.001550891     |  1.1        | 8              
-Turbulence model::update                 | 0.0007519211    |  0.5        | 1              
-Post-treatment operations                | 0.003537479     |  2.5        | 1              
-Other operations                         | 0.005736537     |  4.0        | 
-
-Untracked time                           | 7.27e-05        | 0.00564     | 
-
-Total number of virtual exchanges:                                         848            
-Maximum number of MPI allreduce per time step                              76.6           
+Linear solver resolutions Ax=B           | 0.0842484       | 69.6        | 2              
+Convection operator                      | 0.00200081      |  1.7        | 4              
+Diffusion operator                       | 0.01016694      |  8.4        | 26             
+Gradient operator                        | 0.001346033     |  1.1        | 4              
+Divergence operator                      | 0.00067062      |  0.6        | 3              
+Source terms                             | 0.001082813     |  0.9        | 2              
+Update ::mettre_a_jour                   | 0.001681446     |  1.4        | 1              
+Solver for implicit diffusion            | 0.00801906      |  6.6        | 4              
+Computation of the time step dt          | 0.001480507     |  1.2        | 8              
+Turbulence model::update                 | 0.0004644671    |  0.4        | 1              
+Post-treatment operations                | 0.005549941     |  4.6        | 1              
+Other operations                         | 0.004360034     |  3.6        | 
+Number of virtual exchanges per time step:                                 76             
+Maximum number of MPI allreduce per time step                              66.7           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Average number of iteration of the linear solver per call:                 18             
+Average number of iteration of the linear solver per call:                 28.3           
 
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                Time loop statistics: IO
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Output write sequential:                                                   2095           MB/s
+Output write sequential:                                                   2180           MB/s
 
 ---------------------------------------------------------------------------------------------------------
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      6.6            %
-Max of the fraction of the time spent in communications between processors:          10             %
+Average of the fraction of the time spent in communications between processors:      5.2            %
+Max of the fraction of the time spent in communications between processors:          8.2            %
 Min of the fraction of the time spent in communications between processors:          4.7            %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         4.72424e-06    
-Network maximum bandwidth on all processors:                                         116.6 GB/s     
-Total network traffic:                                                               1288           MB/time step
-Average message size:                                                                461.906        kB
-Min waiting time:                                                                    5              % of total time
-Max waiting time:                                                                    8.3            % of total time
-Avg waiting time:                                                                    6.275          % of total time
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         6.09336e-06    
+Network maximum bandwidth on all processors:                                         131.6 GB/s     
+Total network traffic:                                                               1136.61        MB/time step
+Average message size:                                                                473.762        kB
+Min waiting time:                                                                    4.5            % of total time
+Max waiting time:                                                                    7.8            % of total time
+Avg waiting time:                                                                    6.6875         % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0950047       | 66.4        | 2               | 
-Kernels:                                 | 0.0258533       | 18.1        | 1090            | 
-Copy host to device:                     | 0.00027859      |  0.2        | 12              | 4.6 GB/s
-Copy device to host:                     | 0.000448907     |  0.3        | 10              | 5.0 GB/s
-Alloc/Free on device:                    | 0.00298098      |  2.1        | 60              | 
-GPU: 84% Copy H<->D: 0.51% Alloc/free: 2.1% Comm: 8.3% CPU & I/O: 4.6%
+Libraries:                               | 0.0839351       | 69.3        | 2               | 
+Kernels:                                 | 0.0227148       | 18.8        | 1086            | 
+Copy host to device:                     | 0.000178755     |  0.1        | 9               | 3.0 GB/s
+Copy device to host:                     | 0.000334874     |  0.3        | 7               | 4.5 GB/s
+Alloc/Free on device:                    | 0.000144695     |  0.1        | 60              | 
+GPU: 88% Copy H<->D: 0.42% Alloc/free: 0.12% Comm: 4.9% CPU & I/O: 6.5%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0187218      
+Time of the post-resolution:                                               0.0332711      
 Maximum number of MPI allreduce per time step                              7              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -110,5 +109,6 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       9.53745        
+Total time for the whole computation                                       9.87309        
 
+[Slurm] Power consumption (25 s):  0.894 kW  0.006 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.lumi_gfx90ax8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.lumi_gfx90ax8
index 64b93ba748..013e038296 100644
--- a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.lumi_gfx90ax8
+++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.lumi_gfx90ax8
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     24-02-2026 -- 00:10:38
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     15-05-2026 -- 20:37:03
+OS:       nid005023__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,42 +22,42 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                34.1602        
-Number of virtual exchanges:                                               88             
-Maximum number of MPI allreduce per time step                              230            
+Total time of the start-up:                                                55.7779        
+Number of virtual exchanges:                                               91             
+Maximum number of MPI allreduce per time step                              234            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             2.11372        
+Average time of the resolution of the linear problem per call:             2.46717        
 Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.73935        
+Total time of the time loop:                                               1.58648        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.193261       
-Standard deviation between time steps:                                     0.00394785     
-Time elapsed in the skipped time steps:                                    0.403044       
+Average time per time step:                                                0.176276       
+Standard deviation between time steps:                                     0.00386474     
+Time elapsed in the skipped time steps:                                    0.382443       
 
-Percent of total time spend in communication:                              7.14789        
+Percent of total time spend in communication:                              7.163          
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0995881       | 41.8        | 2              
-Convection operator                      | 0.008435531     |  3.5        | 4              
-Diffusion operator                       | 0.02842003      | 11.9        | 26             
-Gradient operator                        | 0.009849339     |  4.1        | 4              
-Divergence operator                      | 0.002951774     |  1.2        | 3              
-Source terms                             | 0.009550072     |  4.0        | 2              
-Update ::mettre_a_jour                   | 0.003524391     |  1.5        | 1              
-Solver for implicit diffusion            | 0.01629956      |  6.8        | 4              
-Computation of the time step dt          | 0.005308412     |  2.2        | 8              
-Turbulence model::update                 | 0.001210523     |  0.5        | 1              
-Post-treatment operations                | 0.002457478     |  1.0        | 1              
-Other operations                         | 0.005665654     |  2.4        | 
-Number of virtual exchanges per time step:                                 80             
+Linear solver resolutions Ax=B           | 0.0943011       | 53.5        | 2              
+Convection operator                      | 0.00531937      |  3.0        | 4              
+Diffusion operator                       | 0.02432901      | 13.8        | 26             
+Gradient operator                        | 0.005692192     |  3.2        | 4              
+Divergence operator                      | 0.003190957     |  1.8        | 3              
+Source terms                             | 0.009910483     |  5.6        | 2              
+Update ::mettre_a_jour                   | 0.003489126     |  2.0        | 1              
+Solver for implicit diffusion            | 0.01607808      |  9.1        | 4              
+Computation of the time step dt          | 0.004845297     |  2.7        | 8              
+Turbulence model::update                 | 0.001111563     |  0.6        | 1              
+Post-treatment operations                | 0.00256484      |  1.5        | 1              
+Other operations                         | 0.005443877     |  3.1        | 
+Number of virtual exchanges per time step:                                 76             
 Maximum number of MPI allreduce per time step                              66.7           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -70,38 +70,38 @@ Average number of iteration of the linear solver per call:                 30
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                Time loop statistics: IO
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Output write sequential:                                                   963            MB/s
+Output write sequential:                                                   951            MB/s
 
 ---------------------------------------------------------------------------------------------------------
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      4.7            %
+Average of the fraction of the time spent in communications between processors:      5              %
 Max of the fraction of the time spent in communications between processors:          9              %
-Min of the fraction of the time spent in communications between processors:          4.4            %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         3.59628e-06    
-Network maximum bandwidth on all processors:                                         41.8 GB/s      
+Min of the fraction of the time spent in communications between processors:          4.9            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         3.71493e-06    
+Network maximum bandwidth on all processors:                                         42.4 GB/s      
 Total network traffic:                                                               1136.61        MB/time step
 Average message size:                                                                473.762        kB
-Min waiting time:                                                                    4.3            % of total time
-Max waiting time:                                                                    8.6            % of total time
-Avg waiting time:                                                                    5.675          % of total time
+Min waiting time:                                                                    4.8            % of total time
+Max waiting time:                                                                    8.7            % of total time
+Avg waiting time:                                                                    6.05           % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0991658       | 51.3        | 2               | 
-Kernels:                                 | 0.0714553       | 37.0        | 1084            | 
-Copy host to device:                     | 0.000330243     |  0.2        | 12              | 3.9 GB/s
-Copy device to host:                     | 0.00048114      |  0.2        | 10              | 4.7 GB/s
-Alloc/Free on device:                    | 0.000176988     |  0.1        | 60              | 
-GPU: 88% Copy H<->D: 0.42% Alloc/free: 0.092% Comm: 8.8% CPU & I/O: 2.4%
+Libraries:                               | 0.0938749       | 53.3        | 2               | 
+Kernels:                                 | 0.0608573       | 34.5        | 1086            | 
+Copy host to device:                     | 0.000337062     |  0.2        | 12              | 3.8 GB/s
+Copy device to host:                     | 0.000501417     |  0.3        | 10              | 4.5 GB/s
+Alloc/Free on device:                    | 3.41883e-05     |  0.0        | 60              | 
+GPU: 88% Copy H<->D: 0.48% Alloc/free: 0.019% Comm: 8.9% CPU & I/O: 2.8%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0405752      
+Time of the post-resolution:                                               0.0376324      
 Maximum number of MPI allreduce per time step                              7              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -109,6 +109,6 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       36.3432        
+Total time for the whole computation                                       57.7844        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (88 s):  0.510 kW  0.012 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.topaze_cc80x8 b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.topaze_cc80x8
index 56bb61f074..16f94723d3 100644
--- a/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.topaze_cc80x8
+++ b/tests/GPU/JEL_bous/PAR_JEL_bous_BENCH.TU.topaze_cc80x8
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 17:52:49
-OS:       topaze7033__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     15-05-2026 -- 13:40:59
+OS:       topaze7018__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,86 +22,86 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                10.7416        
-Number of virtual exchanges:                                               88             
-Maximum number of MPI allreduce per time step                              230            
+Total time of the start-up:                                                10.6546        
+Number of virtual exchanges:                                               91             
+Maximum number of MPI allreduce per time step                              234            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.74355        
+Average time of the resolution of the linear problem per call:             1.49978        
 Average number of iteration of the linear solver per call:                 18             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.62302        
+Total time of the time loop:                                               1.57279        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.180335       
-Standard deviation between time steps:                                     0.00980316     
-Time elapsed in the skipped time steps:                                    0.429487       
+Average time per time step:                                                0.174755       
+Standard deviation between time steps:                                     0.0105474      
+Time elapsed in the skipped time steps:                                    0.406459       
 
-Percent of total time spend in communication:                              8.93758        
+Percent of total time spend in communication:                              7.75645        
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.118436        | 51.9        | 2              
-Convection operator                      | 0.00377104      |  1.7        | 4              
-Diffusion operator                       | 0.01757169      |  7.7        | 26             
-Gradient operator                        | 0.002272595     |  1.0        | 4              
-Divergence operator                      | 0.002091061     |  0.9        | 3              
-Source terms                             | 0.001597418     |  0.7        | 2              
-Update ::mettre_a_jour                   | 0.003249867     |  1.4        | 1              
-Solver for implicit diffusion            | 0.01671793      |  7.3        | 4              
-Computation of the time step dt          | 0.002845562     |  1.2        | 8              
-Turbulence model::update                 | 0.0009740548    |  0.4        | 1              
-Post-treatment operations                | 0.004604644     |  2.0        | 1              
-Other operations                         | 0.006203393     |  2.7        | 
-Number of virtual exchanges per time step:                                 80             
+Linear solver resolutions Ax=B           | 0.122977        | 70.4        | 2              
+Convection operator                      | 0.002903186     |  1.7        | 4              
+Diffusion operator                       | 0.0138009       |  7.9        | 26             
+Gradient operator                        | 0.001907345     |  1.1        | 4              
+Divergence operator                      | 0.001499655     |  0.9        | 3              
+Source terms                             | 0.001528563     |  0.9        | 2              
+Update ::mettre_a_jour                   | 0.002610787     |  1.5        | 1              
+Solver for implicit diffusion            | 0.01478346      |  8.5        | 4              
+Computation of the time step dt          | 0.002501599     |  1.4        | 8              
+Turbulence model::update                 | 0.0006880512    |  0.4        | 1              
+Post-treatment operations                | 0.004329323     |  2.5        | 1              
+Other operations                         | 0.005224245     |  3.0        | 
+Number of virtual exchanges per time step:                                 76             
 Maximum number of MPI allreduce per time step                              66.7           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Average number of iteration of the linear solver per call:                 29.7           
+Average number of iteration of the linear solver per call:                 28.3           
 
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                Time loop statistics: IO
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Output write sequential:                                                   1082           MB/s
+Output write sequential:                                                   1103           MB/s
 
 ---------------------------------------------------------------------------------------------------------
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      6.8            %
-Max of the fraction of the time spent in communications between processors:          11.4           %
-Min of the fraction of the time spent in communications between processors:          6.2            %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         1.70475e-05    
+Average of the fraction of the time spent in communications between processors:      6.5            %
+Max of the fraction of the time spent in communications between processors:          11             %
+Min of the fraction of the time spent in communications between processors:          6.1            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         1.71348e-05    
 Network maximum bandwidth on all processors:                                         46.4 GB/s      
 Total network traffic:                                                               1136.61        MB/time step
 Average message size:                                                                473.762        kB
-Min waiting time:                                                                    6.9            % of total time
-Max waiting time:                                                                    10.6           % of total time
-Avg waiting time:                                                                    8.05           % of total time
+Min waiting time:                                                                    6.7            % of total time
+Max waiting time:                                                                    9              % of total time
+Avg waiting time:                                                                    7.475          % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.118065        | 65.5        | 2               | 
-Kernels:                                 | 0.0327413       | 18.2        | 1084            | 
-Copy host to device:                     | 0.000336012     |  0.2        | 12              | 3.9 GB/s
-Copy device to host:                     | 0.000480723     |  0.3        | 10              | 4.7 GB/s
-Alloc/Free on device:                    | 0.0033014       |  1.8        | 60              | 
-GPU: 84% Copy H<->D: 0.45% Alloc/free: 1.8% Comm: 11% CPU & I/O: 2.8%
+Libraries:                               | 0.122609        | 70.2        | 2               | 
+Kernels:                                 | 0.0292873       | 16.8        | 1086            | 
+Copy host to device:                     | 0.00032894      |  0.2        | 12              | 3.9 GB/s
+Copy device to host:                     | 0.000453863     |  0.3        | 10              | 5.0 GB/s
+Alloc/Free on device:                    | 0.000211454     |  0.1        | 60              | 
+GPU: 87% Copy H<->D: 0.45% Alloc/free: 0.12% Comm: 9.8% CPU & I/O: 2.8%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0423874      
+Time of the post-resolution:                                               0.0395549      
 Maximum number of MPI allreduce per time step                              7              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -109,5 +109,6 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       12.8365        
+Total time for the whole computation                                       12.6734        
 
+[Slurm] Power consumption (59 s):  5.418 kW  0.089 kWh  0.009 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx90a b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx90a
index 47b2a9e24a..023bcc5668 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx90a
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 21:15:33
-OS:       g1023__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 09:27:03
+OS:       g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 51840
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                5.93888        
+Total time of the start-up:                                                6.12593        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.185608       
-Average number of iteration of the linear solver per call:                 48             
+Average time of the resolution of the linear problem per call:             0.193628       
+Average number of iteration of the linear solver per call:                 26             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.131921       
+Total time of the time loop:                                               0.0968883      
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0146579      
-Standard deviation between time steps:                                     0.00787638     
-Time elapsed in the skipped time steps:                                    0.0157475      
+Average time per time step:                                                0.0107654      
+Standard deviation between time steps:                                     0.00741937     
+Time elapsed in the skipped time steps:                                    0.0109251      
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.00852404      | 51.9        | 1              
-Convection operator                      | 0.0004709512    |  2.9        | 1              
-Diffusion operator                       | 0.0002746094    |  1.7        | 1              
-Gradient operator                        | 0.0003199339    |  1.9        | 2              
-Divergence operator                      | 0.0001807804    |  1.1        | 2              
-Update ::mettre_a_jour                   | 0.0003165519    |  1.9        | 1              
-Computation of the time step dt          | 0.0002572336    |  1.6        | 2              
-Post-treatment operations                | 0.00343332      | 20.9        | 1              
-Other operations                         | 0.0008805088    |  5.4        | 
+Linear solver resolutions Ax=B           | 0.00462653      | 43.0        | 1              
+Convection operator                      | 0.000513395     |  4.8        | 1              
+Diffusion operator                       | 0.0002880368    |  2.7        | 1              
+Gradient operator                        | 0.0002975948    |  2.8        | 2              
+Divergence operator                      | 0.0001837762    |  1.7        | 2              
+Update ::mettre_a_jour                   | 0.0003529006    |  3.3        | 1              
+Computation of the time step dt          | 0.0002610717    |  2.4        | 2              
+Post-treatment operations                | 0.003305068     | 30.7        | 1              
+Other operations                         | 0.0009369908    |  8.7        | 
 
-Average number of iteration of the linear solver per call:                 42             
+Average number of iteration of the linear solver per call:                 22.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 42
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.00845284      | 57.7        | 1               | 
-Kernels:                                 | 0.00235428      | 16.1        | 98              | 
-Copy host to device:                     | 0.000145101     |  1.0        | 8               | 1.5 GB/s
-Copy device to host:                     | 0.000132154     |  0.9        | 4               | 4.1 GB/s
+Libraries:                               | 0.00455693      | 42.3        | 1               | 
+Kernels:                                 | 0.00238087      | 22.1        | 97              | 
+Copy host to device:                     | 0.000149197     |  1.4        | 8               | 1.4 GB/s
+Copy device to host:                     | 0.000136187     |  1.3        | 4               | 4.0 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 74% Copy H<->D: 1.9% Alloc/free: 0% Comm: 0% CPU & I/O: 24%
+GPU: 64% Copy H<->D: 2.7% Alloc/free: 0% Comm: 0% CPU & I/O: 33%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.00639265     
+Time of the post-resolution:                                               0.00551343     
 
-Total time for the whole computation                                       6.09295        
+Total time for the whole computation                                       6.23927        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (14 s):  0.268 kW  0.001 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx942 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx942
index b06f2a3e97..99aa203f47 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx942
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.adastra_gfx942
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     17-03-2026 -- 17:58:19
-OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 14:47:58
+OS:       a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD Instinct MI300A Accelerator
 Total number of threads:192
 GPU model: AMD Instinct MI300A
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 51840
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                7.01774        
+Total time of the start-up:                                                7.06812        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.206942       
-Average number of iteration of the linear solver per call:                 48             
+Average time of the resolution of the linear problem per call:             0.203738       
+Average number of iteration of the linear solver per call:                 26             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.0987891      
+Total time of the time loop:                                               0.0858901      
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0109766      
-Standard deviation between time steps:                                     0.00617032     
-Time elapsed in the skipped time steps:                                    0.0148606      
+Average time per time step:                                                0.00954334     
+Standard deviation between time steps:                                     0.0058059      
+Time elapsed in the skipped time steps:                                    0.0155645      
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.00647832      | 59.0        | 1              
-Convection operator                      | 0.0002395161    |  2.2        | 1              
-Diffusion operator                       | 0.0001366721    |  1.2        | 1              
-Gradient operator                        | 0.0001858223    |  1.7        | 2              
-Divergence operator                      | 0.0001313133    |  1.2        | 2              
-Update ::mettre_a_jour                   | 0.0002394639    |  2.2        | 1              
-Computation of the time step dt          | 0.00016156      |  1.5        | 2              
-Post-treatment operations                | 0.002719926     | 24.8        | 1              
-Other operations                         | 0.0006839746    |  6.2        | 
+Linear solver resolutions Ax=B           | 0.00467528      | 49.0        | 1              
+Convection operator                      | 0.0002698356    |  2.8        | 1              
+Diffusion operator                       | 0.0001643067    |  1.7        | 1              
+Gradient operator                        | 0.0002388463    |  2.5        | 2              
+Divergence operator                      | 0.0001593037    |  1.7        | 2              
+Update ::mettre_a_jour                   | 0.000304429     |  3.2        | 1              
+Computation of the time step dt          | 0.0001922116    |  2.0        | 2              
+Post-treatment operations                | 0.002630262     | 27.6        | 1              
+Other operations                         | 0.0009088682    |  9.5        | 
 
-Average number of iteration of the linear solver per call:                 42             
+Average number of iteration of the linear solver per call:                 22.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 42
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0064201       | 58.5        | 1               | 
-Kernels:                                 | 0.00149049      | 13.6        | 98              | 
-Copy host to device:                     | 0.000110276     |  1.0        | 8               | 1.9 GB/s
-Copy device to host:                     | 0.000110103     |  1.0        | 4               | 5.0 GB/s
+Libraries:                               | 0.00460235      | 48.2        | 1               | 
+Kernels:                                 | 0.00185681      | 19.5        | 97              | 
+Copy host to device:                     | 0.000133123     |  1.4        | 8               | 1.6 GB/s
+Copy device to host:                     | 0.000127818     |  1.3        | 4               | 4.3 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 72% Copy H<->D: 2% Alloc/free: 0% Comm: 0% CPU & I/O: 26%
+GPU: 68% Copy H<->D: 2.7% Alloc/free: 0% Comm: 0% CPU & I/O: 30%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.00526473     
+Time of the post-resolution:                                               0.0048061      
 
-Total time for the whole computation                                       7.13666        
+Total time for the whole computation                                       7.17439        
 
-[Slurm] Power consumption (15 s):  0.446 kW  0.002 kWh  0.000 € (0.10€/kWh)
+[Slurm] Power consumption (16 s):  0.396 kW  0.002 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.dalianvl_cc100 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.dalianvl_cc100
new file mode 100644
index 0000000000..28cbcce359
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.dalianvl_cc100
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:17:34
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 51840
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                1.2994         
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.0740778      
+Average number of iteration of the linear solver per call:                 26             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.106793       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0118658      
+Standard deviation between time steps:                                     0.00600289     
+Time elapsed in the skipped time steps:                                    0.00831561     
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0065994       | 55.6        | 1              
+Convection operator                      | 0.0002036051    |  1.7        | 1              
+Diffusion operator                       | 0.0001235908    |  1.0        | 1              
+Gradient operator                        | 0.0001959857    |  1.7        | 2              
+Divergence operator                      | 0.0001151963    |  1.0        | 2              
+Update ::mettre_a_jour                   | 0.0002897668    |  2.4        | 1              
+Computation of the time step dt          | 0.0001778879    |  1.5        | 2              
+Post-treatment operations                | 0.002707086     | 22.8        | 1              
+Other operations                         | 0.001453324     | 12.2        | 
+
+Average number of iteration of the linear solver per call:                 22.2           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00650064      | 54.8        | 1               | 
+Kernels:                                 | 0.00148095      | 12.5        | 97              | 
+Copy host to device:                     | 0.000140437     |  1.2        | 8               | 1.5 GB/s
+Copy device to host:                     | 8.81669e-05     |  0.7        | 4               | 6.2 GB/s
+Alloc/Free on device:                    | 0               |  0.0        | 0               | 
+GPU: 67% Copy H<->D: 1.9% Alloc/free: 0% Comm: 0% CPU & I/O: 31%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.00378595     
+
+Total time for the whole computation                                       1.4183         
+
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.eureka_cc89 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.eureka_cc89
new file mode 100644
index 0000000000..7093500372
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.eureka_cc89
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:34:51
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 51840
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                1.19061        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.124688       
+Average number of iteration of the linear solver per call:                 26             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.112536       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.012504       
+Standard deviation between time steps:                                     0.00837201     
+Time elapsed in the skipped time steps:                                    0.00515174     
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00692501      | 55.4        | 1              
+Convection operator                      | 0.0002694614    |  2.2        | 1              
+Diffusion operator                       | 0.0001328644    |  1.1        | 1              
+Gradient operator                        | 0.0001807441    |  1.4        | 2              
+Divergence operator                      | 0.0001022032    |  0.8        | 2              
+Update ::mettre_a_jour                   | 0.0002516219    |  2.0        | 1              
+Computation of the time step dt          | 0.0001513772    |  1.2        | 2              
+Post-treatment operations                | 0.003694517     | 29.5        | 1              
+Other operations                         | 0.0007962459    |  6.4        | 
+
+Average number of iteration of the linear solver per call:                 22.2           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00685329      | 54.8        | 1               | 
+Kernels:                                 | 0.00129797      | 10.4        | 97              | 
+Copy host to device:                     | 0.000107672     |  0.9        | 8               | 2.0 GB/s
+Copy device to host:                     | 0.000139247     |  1.1        | 4               | 3.9 GB/s
+Alloc/Free on device:                    | 0               |  0.0        | 0               | 
+GPU: 65% Copy H<->D: 2% Alloc/free: 0% Comm: 0% CPU & I/O: 33%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.00246909     
+
+Total time for the whole computation                                       1.31078        
+
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.irene-amd-ccrt_cc70 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.irene-amd-ccrt_cc70
index 69c8125fd3..2f0b2a6301 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.irene-amd-ccrt_cc70
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.irene-amd-ccrt_cc70
@@ -8,52 +8,51 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     14-11-2025 -- 10:32:59
-OS:       irene7056__Linux__x86_64__4.18.0-553.69.1.el8_10.x86_64__#1 SMP Thu Aug 7 18:10:00 EDT 2025
-CPU:      Model name:  Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz ; Thread(s) per core:  2
-GPU:      | NVIDIA-SMI 570.195.03  Driver Version: 570.195.03  CUDA Version: 12.8  |
-
-Nb procs: 1
-TRUST version: 1.9.7_beta
+Date:     23-04-2026 -- 14:56:21
+OS:       irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
+Total number of threads:80
+GPU model: Tesla V100-SXM2-16GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 51840
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                3.65722        
-Percent of untracked time during computation start-up:                     1.05162e-06    
+Total time of the start-up:                                                1.07934        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.501134       
-Average number of iteration of the linear solver per call:                 48             
+Average time of the resolution of the linear problem per call:             0.116356       
+Average number of iteration of the linear solver per call:                 26             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.233292       
+Total time of the time loop:                                               0.152678       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0259213      
-Standard deviation between time steps:                                     0.0101201      
-Time elapsed in the skipped time steps:                                    0.0285981      
+Average time per time step:                                                0.0169642      
+Standard deviation between time steps:                                     0.0113575      
+Time elapsed in the skipped time steps:                                    0.0292522      
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0187438       | 72.3        | 1              
-Convection operator                      | 0.0005950887    |  2.3        | 1              
-Diffusion operator                       | 0.0002307317    |  0.9        | 1              
-Gradient operator                        | 0.000239906     |  0.9        | 2              
-Divergence operator                      | 0.0001558698    |  0.6        | 2              
-Update ::mettre_a_jour                   | 0.0003146512    |  1.2        | 1              
-Computation of the time step dt          | 0.0002480136    |  1.0        | 2              
-Post-treatment operations                | 0.004528829     | 17.5        | 1              
-Other operations                         | 0.0008644418    |  3.3        | 
-
-Untracked time                           | 4.99e-05        | 0.0214      | 
-
+Linear solver resolutions Ax=B           | 0.00966756      | 57.0        | 1              
+Convection operator                      | 0.0003741867    |  2.2        | 1              
+Diffusion operator                       | 0.0001976752    |  1.2        | 1              
+Gradient operator                        | 0.0002378338    |  1.4        | 2              
+Divergence operator                      | 0.0001592834    |  0.9        | 2              
+Update ::mettre_a_jour                   | 0.0003339658    |  2.0        | 1              
+Computation of the time step dt          | 0.0002590474    |  1.5        | 2              
+Post-treatment operations                | 0.004811701     | 28.4        | 1              
+Other operations                         | 0.0009229322    |  5.4        | 
 
-Average number of iteration of the linear solver per call:                 41.8           
+Average number of iteration of the linear solver per call:                 22.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -61,16 +60,17 @@ Average number of iteration of the linear solver per call:                 41.8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0186583       | 72.0        | 1               | 
-Kernels:                                 | 0.00218357      |  8.4        | 98              | 
-Copy host to device:                     | 0.000161664     |  0.6        | 8               | 1.3 GB/s
-Copy device to host:                     | 0.000185338     |  0.7        | 4               | 2.9 GB/s
+Libraries:                               | 0.00957908      | 56.5        | 1               | 
+Kernels:                                 | 0.00190241      | 11.2        | 97              | 
+Copy host to device:                     | 0.000173999     |  1.0        | 8               | 1.2 GB/s
+Copy device to host:                     | 0.000192334     |  1.1        | 4               | 2.8 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 80% Copy H<->D: 1.3% Alloc/free: 0% Comm: 0% CPU & I/O: 18%
+GPU: 68% Copy H<->D: 2.2% Alloc/free: 0% Comm: 0% CPU & I/O: 30%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0107429      
+Time of the post-resolution:                                               0.00647745     
 
-Total time for the whole computation                                       3.92985        
+Total time for the whole computation                                       1.26776        
 
+[Slurm] Power consumption (13 s):  0.243 kW  0.001 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is157091_cc86 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is157091_cc86
index 57da033e4c..4ea430e541 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is157091_cc86
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is157091_cc86
@@ -8,7 +8,7 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     11-03-2026 -- 20:22:16
+Date:     22-04-2026 -- 20:45:47
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 51840
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                0.436959       
+Total time of the start-up:                                                0.491229       
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.0725759      
-Average number of iteration of the linear solver per call:                 48             
+Average time of the resolution of the linear problem per call:             0.0660598      
+Average number of iteration of the linear solver per call:                 26             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.173573       
+Total time of the time loop:                                               0.106553       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0192858      
-Standard deviation between time steps:                                     0.00569502     
-Time elapsed in the skipped time steps:                                    0.00380361     
+Average time per time step:                                                0.0118392      
+Standard deviation between time steps:                                     0.00583933     
+Time elapsed in the skipped time steps:                                    0.00365864     
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0149777       | 77.7        | 1              
-Convection operator                      | 0.0003584584    |  1.9        | 1              
-Diffusion operator                       | 0.0001395967    |  0.7        | 1              
-Gradient operator                        | 0.0001576809    |  0.8        | 2              
-Divergence operator                      | 0.0001185614    |  0.6        | 2              
-Update ::mettre_a_jour                   | 0.0002077442    |  1.1        | 1              
-Computation of the time step dt          | 0.0001929384    |  1.0        | 2              
-Post-treatment operations                | 0.002563678     | 13.3        | 1              
-Other operations                         | 0.0005694627    |  3.0        | 
+Linear solver resolutions Ax=B           | 0.00756416      | 63.9        | 1              
+Convection operator                      | 0.0003625891    |  3.1        | 1              
+Diffusion operator                       | 0.0001376103    |  1.2        | 1              
+Gradient operator                        | 0.0001551979    |  1.3        | 2              
+Divergence operator                      | 0.0001102787    |  0.9        | 2              
+Update ::mettre_a_jour                   | 0.0002033066    |  1.7        | 1              
+Computation of the time step dt          | 0.0001916568    |  1.6        | 2              
+Post-treatment operations                | 0.002546436     | 21.5        | 1              
+Other operations                         | 0.0005679914    |  4.8        | 
 
-Average number of iteration of the linear solver per call:                 41.8           
+Average number of iteration of the linear solver per call:                 22.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -60,16 +60,16 @@ Average number of iteration of the linear solver per call:                 41.8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0149218       | 77.4        | 1               | 
-Kernels:                                 | 0.00149672      |  7.8        | 98              | 
-Copy host to device:                     | 8.56409e-05     |  0.4        | 8               | 2.5 GB/s
-Copy device to host:                     | 0.000105018     |  0.5        | 4               | 5.2 GB/s
+Libraries:                               | 0.00750619      | 63.4        | 1               | 
+Kernels:                                 | 0.00146924      | 12.4        | 97              | 
+Copy host to device:                     | 8.5329e-05      |  0.7        | 8               | 2.5 GB/s
+Copy device to host:                     | 0.000105709     |  0.9        | 4               | 5.2 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 85% Copy H<->D: 0.99% Alloc/free: 0% Comm: 0% CPU & I/O: 14%
+GPU: 76% Copy H<->D: 1.6% Alloc/free: 0% Comm: 0% CPU & I/O: 23%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.00183913     
+Time of the post-resolution:                                               0.00270217     
 
-Total time for the whole computation                                       0.616183       
+Total time for the whole computation                                       0.604151       
 
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is159479_cc120 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is159479_cc120
new file mode 100644
index 0000000000..19f788812e
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is159479_cc120
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-05-2026 -- 07:05:25
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May  1 12:45:19 UTC 2026 (6
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 51840
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                0.363946       
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.0499642      
+Average number of iteration of the linear solver per call:                 26             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.0787909      
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.00875454     
+Standard deviation between time steps:                                     0.00358588     
+Time elapsed in the skipped time steps:                                    0.00282589     
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00601979      | 68.8        | 1              
+Convection operator                      | 0.0001674919    |  1.9        | 1              
+Diffusion operator                       | 8.354833e-05    |  1.0        | 1              
+Gradient operator                        | 0.0001087588    |  1.2        | 2              
+Divergence operator                      | 6.7767e-05      |  0.8        | 2              
+Update ::mettre_a_jour                   | 0.0001461213    |  1.7        | 1              
+Computation of the time step dt          | 9.826378e-05    |  1.1        | 2              
+Post-treatment operations                | 0.001635225     | 18.7        | 1              
+Other operations                         | 0.0004275759    |  4.9        | 
+
+Average number of iteration of the linear solver per call:                 22.2           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00598622      | 68.4        | 1               | 
+Kernels:                                 | 0.000918968     | 10.5        | 97              | 
+Copy host to device:                     | 6.19321e-05     |  0.7        | 8               | 3.5 GB/s
+Copy device to host:                     | 0.000119888     |  1.4        | 4               | 4.5 GB/s
+Alloc/Free on device:                    | 0               |  0.0        | 0               | 
+GPU: 79% Copy H<->D: 2.1% Alloc/free: 0% Comm: 0% CPU & I/O: 19%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.00140214     
+
+Total time for the whole computation                                       0.446975       
+
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is247793_gfx1100 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is247793_gfx1100
new file mode 100644
index 0000000000..5b87937b2e
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.is247793_gfx1100
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 19:04:54
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 51840
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                1.23986        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.413036       
+Average number of iteration of the linear solver per call:                 26             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.0998113      
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0110901      
+Standard deviation between time steps:                                     0.00412294     
+Time elapsed in the skipped time steps:                                    0.00524283     
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.00646524      | 58.3        | 1              
+Convection operator                      | 0.0003997576    |  3.6        | 1              
+Diffusion operator                       | 0.0002390461    |  2.2        | 1              
+Gradient operator                        | 0.0002989122    |  2.7        | 2              
+Divergence operator                      | 0.0001892956    |  1.7        | 2              
+Update ::mettre_a_jour                   | 0.0003517464    |  3.2        | 1              
+Computation of the time step dt          | 0.0002773126    |  2.5        | 2              
+Post-treatment operations                | 0.001906918     | 17.2        | 1              
+Other operations                         | 0.0009619149    |  8.7        | 
+
+Average number of iteration of the linear solver per call:                 22.2           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.00638564      | 57.6        | 1               | 
+Kernels:                                 | 0.00255147      | 23.0        | 97              | 
+Copy host to device:                     | 0.000176835     |  1.6        | 8               | 1.2 GB/s
+Copy device to host:                     | 9.96538e-05     |  0.9        | 4               | 5.5 GB/s
+Alloc/Free on device:                    | 0               |  0.0        | 0               | 
+GPU: 81% Copy H<->D: 2.5% Alloc/free: 0% Comm: 0% CPU & I/O: 17%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.00283027     
+
+Total time for the whole computation                                       1.34775        
+
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.jean-zay_cc90 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.jean-zay_cc90
index f8fb79c9e3..4b7e6ff550 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.jean-zay_cc90
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.jean-zay_cc90
@@ -8,52 +8,51 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     19-11-2025 -- 11:42:09
-OS:       jzxh177__Linux__x86_64__5.14.0-427.76.1.el9_4.x86_64__#1 SMP PREEMPT_DYNAMIC Fri Jun 27 09:53:45 EDT 2025
-CPU:      Model name:  Intel(R) Xeon(R) Platinum 8468 ; Thread(s) per core:  2
-GPU:      | NVIDIA-SMI 570.86.15  Driver Version: 570.86.15  CUDA Version: 12.8  |
-|  0  NVIDIA H100 80GB HBM3  On  |  00000000:
-Nb procs: 1
-TRUST version: 1.9.7_beta
+Date:     23-04-2026 -- 08:20:22
+OS:       jzxh011__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 51840
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                3.28177        
-Percent of untracked time during computation start-up:                     9.65941e-08    
+Total time of the start-up:                                                2.50003        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.70507        
-Average number of iteration of the linear solver per call:                 48             
+Average time of the resolution of the linear problem per call:             0.0936644      
+Average number of iteration of the linear solver per call:                 26             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.183362       
+Total time of the time loop:                                               0.112767       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0203736      
-Standard deviation between time steps:                                     0.00609699     
-Time elapsed in the skipped time steps:                                    0.0231738      
+Average time per time step:                                                0.0125296      
+Standard deviation between time steps:                                     0.00642425     
+Time elapsed in the skipped time steps:                                    0.112153       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0146579       | 71.9        | 1              
-Convection operator                      | 0.0002678476    |  1.3        | 1              
-Diffusion operator                       | 0.0001548862    |  0.8        | 1              
-Gradient operator                        | 0.0001796749    |  0.9        | 2              
-Divergence operator                      | 0.0001112369    |  0.5        | 2              
-Update ::mettre_a_jour                   | 0.0002635913    |  1.3        | 1              
-Computation of the time step dt          | 0.0001501152    |  0.7        | 2              
-Post-treatment operations                | 0.002908446     | 14.3        | 1              
-Other operations                         | 0.001679913     |  8.2        | 
+Linear solver resolutions Ax=B           | 0.00675927      | 53.9        | 1              
+Convection operator                      | 0.000223977     |  1.8        | 1              
+Diffusion operator                       | 0.0001341254    |  1.1        | 1              
+Gradient operator                        | 0.0001789626    |  1.4        | 2              
+Divergence operator                      | 0.0001138861    |  0.9        | 2              
+Update ::mettre_a_jour                   | 0.0002445856    |  2.0        | 1              
+Computation of the time step dt          | 0.0001574472    |  1.3        | 2              
+Post-treatment operations                | 0.00296707      | 23.7        | 1              
+Other operations                         | 0.001750308     | 14.0        | 
 
-Untracked time                           | 5.81e-05        | 0.0317      | 
-
-
-Average number of iteration of the linear solver per call:                 41.8           
+Average number of iteration of the linear solver per call:                 22.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -61,16 +60,17 @@ Average number of iteration of the linear solver per call:                 41.8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0145851       | 71.6        | 1               | 
-Kernels:                                 | 0.0014527       |  7.1        | 98              | 
-Copy host to device:                     | 0.000107502     |  0.5        | 8               | 2.0 GB/s
-Copy device to host:                     | 0.000125785     |  0.6        | 4               | 4.3 GB/s
+Libraries:                               | 0.00669146      | 53.4        | 1               | 
+Kernels:                                 | 0.00135639      | 10.8        | 97              | 
+Copy host to device:                     | 0.000121315     |  1.0        | 8               | 1.8 GB/s
+Copy device to host:                     | 0.000134266     |  1.1        | 4               | 4.1 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 79% Copy H<->D: 1.1% Alloc/free: 0% Comm: 0% CPU & I/O: 20%
+GPU: 64% Copy H<->D: 2% Alloc/free: 0% Comm: 0% CPU & I/O: 34%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0166082      
+Time of the post-resolution:                                               0.0203563      
 
-Total time for the whole computation                                       3.50492        
+Total time for the whole computation                                       2.74533        
 
+[Slurm] Power consumption (12 s):  0.393 kW  0.001 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.lumi_gfx90a b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.lumi_gfx90a
index d575a024b3..820e885b9b 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.lumi_gfx90a
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.lumi_gfx90a
@@ -1,76 +1 @@
-                                             # Global performance file #
-
-This is the global file for tracking performance in TRUST. It stores aggregated quantities.
-More detailed statistics can be found in the OpenMP_Iterateur_csv.TU file
-For time loop, only standard counters of level 1 are printed alongside your custom counters
-Time is given in seconds
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                                              Context of the computation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     24-02-2026 -- 00:12:53
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
-CPU model : AMD EPYC 7A53 64-Core Processor
-Total number of threads:128
-GPU model: AMD Instinct MI250X
-HIP runtime version: 6.43
-HIP drivers version: 6.43
-Nb procs used for the computation: 1
-TRUST version: 1.9.8_beta
-Total number of elements used for the calculation: 51840
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                                           Computation start-up statistics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                39.2215        
-
-Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.44943        
-Average number of iteration of the linear solver per call:                 48             
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                                                 Time loop statistics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.12493        
-Number of time steps:                                                      9              
-Skipped time steps:                                                        1              
-Average time per time step:                                                0.0138811      
-Standard deviation between time steps:                                     0.0069912      
-Time elapsed in the skipped time steps:                                    0.0144053      
-
-
-Standard counter description             | Time/step       | % loop time | Call(s)/step   
-------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.00808639      | 52.2        | 1              
-Convection operator                      | 0.0004544939    |  2.9        | 1              
-Diffusion operator                       | 0.00026234      |  1.7        | 1              
-Gradient operator                        | 0.0003127612    |  2.0        | 2              
-Divergence operator                      | 0.0001773008    |  1.1        | 2              
-Update ::mettre_a_jour                   | 0.000312985     |  2.0        | 1              
-Computation of the time step dt          | 0.0002489606    |  1.6        | 2              
-Post-treatment operations                | 0.003119252     | 20.1        | 1              
-Other operations                         | 0.0009066113    |  5.9        | 
-
-Average number of iteration of the linear solver per call:                 42             
-
-
------------------------------------------------------------------------------------------------------------
-                                                    GPU statistics
------------------------------------------------------------------------------------------------------------
-Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
------------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.00801202      | 57.7        | 1               | 
-Kernels:                                 | 0.00229587      | 16.5        | 98              | 
-Copy host to device:                     | 0.000151657     |  1.1        | 8               | 1.4 GB/s
-Copy device to host:                     | 0.00014025      |  1.0        | 4               | 3.9 GB/s
-Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 74% Copy H<->D: 2.1% Alloc/free: 0% Comm: 0% CPU & I/O: 24%
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                                              Post-resolution statistics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.00522131     
-
-Total time for the whole computation                                       39.366         
-
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (1667 s):  0.521 kW  0.241 kWh  0.024 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.topaze_cc80 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.topaze_cc80
index d45c1128a2..f32139d334 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.topaze_cc80
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.TU.topaze_cc80
@@ -8,52 +8,51 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     16-11-2025 -- 08:34:37
-OS:       topaze7068__Linux__x86_64__4.18.0-553.69.1.el8_10.x86_64__#1 SMP Thu Aug 7 18:10:00 EDT 2025
-CPU:      Model name:  AMD EPYC 7763 64-Core Processor ; Thread(s) per core:  2
-GPU:      | NVIDIA-SMI 570.195.03  Driver Version: 570.195.03  CUDA Version: 12.8  |
-|  0  NVIDIA A100-SXM4-80GB  On  |  0000000
-Nb procs: 1
-TRUST version: 1.9.7_beta
+Date:     15-05-2026 -- 13:41:54
+OS:       topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : AMD EPYC 7763 64-Core Processor
+Total number of threads:256
+GPU model: NVIDIA A100-SXM4-80GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 51840
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                2.35008        
-Percent of untracked time during computation start-up:                     1.74462e-07    
+Total time of the start-up:                                                1.39309        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.362608       
-Average number of iteration of the linear solver per call:                 48             
+Average time of the resolution of the linear problem per call:             0.173037       
+Average number of iteration of the linear solver per call:                 26             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.181071       
+Total time of the time loop:                                               0.117747       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.020119       
-Standard deviation between time steps:                                     0.00767738     
-Time elapsed in the skipped time steps:                                    0.0213211      
+Average time per time step:                                                0.013083       
+Standard deviation between time steps:                                     0.007486       
+Time elapsed in the skipped time steps:                                    0.0172257      
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.014519        | 72.2        | 1              
-Convection operator                      | 0.0003598946    |  1.8        | 1              
-Diffusion operator                       | 0.0001784016    |  0.9        | 1              
-Gradient operator                        | 0.0002058404    |  1.0        | 2              
-Divergence operator                      | 0.0001244321    |  0.6        | 2              
-Update ::mettre_a_jour                   | 0.0003190001    |  1.6        | 1              
-Computation of the time step dt          | 0.0001833313    |  0.9        | 2              
-Post-treatment operations                | 0.003421306     | 17.0        | 1              
-Other operations                         | 0.000807747     |  4.0        | 
+Linear solver resolutions Ax=B           | 0.00780977      | 59.7        | 1              
+Convection operator                      | 0.0002721863    |  2.1        | 1              
+Diffusion operator                       | 0.0001520589    |  1.2        | 1              
+Gradient operator                        | 0.0001971098    |  1.5        | 2              
+Divergence operator                      | 0.0001235297    |  0.9        | 2              
+Update ::mettre_a_jour                   | 0.0002596853    |  2.0        | 1              
+Computation of the time step dt          | 0.0001791377    |  1.4        | 2              
+Post-treatment operations                | 0.003267401     | 25.0        | 1              
+Other operations                         | 0.0008220941    |  6.3        | 
 
-Untracked time                           | 5.96e-05        | 0.0329      | 
-
-
-Average number of iteration of the linear solver per call:                 41.8           
+Average number of iteration of the linear solver per call:                 22.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -61,16 +60,17 @@ Average number of iteration of the linear solver per call:                 41.8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0144396       | 71.8        | 1               | 
-Kernels:                                 | 0.00169249      |  8.4        | 98              | 
-Copy host to device:                     | 0.000121641     |  0.6        | 8               | 1.8 GB/s
-Copy device to host:                     | 0.0001136       |  0.6        | 4               | 4.8 GB/s
+Libraries:                               | 0.00773568      | 59.1        | 1               | 
+Kernels:                                 | 0.0015436       | 11.8        | 97              | 
+Copy host to device:                     | 0.000123199     |  0.9        | 8               | 1.7 GB/s
+Copy device to host:                     | 0.00011577      |  0.9        | 4               | 4.7 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 80% Copy H<->D: 1.2% Alloc/free: 0% Comm: 0% CPU & I/O: 19%
+GPU: 71% Copy H<->D: 1.8% Alloc/free: 0% Comm: 0% CPU & I/O: 27%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.00642543     
+Time of the post-resolution:                                               0.00583202     
 
-Total time for the whole computation                                       2.5589         
+Total time for the whole computation                                       1.53391        
 
+[Slurm] Power consumption (28 s):  0.607 kW  0.005 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.data b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.data
index da6a748340..463d7e03d9 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.data
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur.data
@@ -41,7 +41,8 @@ END PARTITION #
 Scatter DOM.Zones dom
 END SCATTER #
 
-VEFPreP1B dis Lire dis { P0 }
+VEFPreP1B dis 
+Lire dis { P0 reorder { algo Hilbert } }
 
 # Runge_Kutta_ordre_3 #
 Scheme_euler_explicit sch
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100
new file mode 100644
index 0000000000..3faa5b481e
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_Iterateur_BENCH_AmgX_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     09-06-2026 -- 17:39:31
+OS:       dalianvl08__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2592000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                10.2002        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             2.06413        
+Average number of iteration of the linear solver per call:                 41             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.550944       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.061216       
+Standard deviation between time steps:                                     0.00242635     
+Time elapsed in the skipped time steps:                                    0.0899983      
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0498086       | 81.4        | 1              
+Convection operator                      | 0.002785091     |  4.5        | 1              
+Diffusion operator                       | 0.0007763384    |  1.3        | 1              
+Gradient operator                        | 0.001174778     |  1.9        | 2              
+Divergence operator                      | 0.00054559      |  0.9        | 2              
+Update ::mettre_a_jour                   | 0.0007365982    |  1.2        | 1              
+Computation of the time step dt          | 0.00056079      |  0.9        | 2              
+Post-treatment operations                | 0.001837264     |  3.0        | 1              
+Other operations                         | 0.002991011     |  4.9        | 
+
+Average number of iteration of the linear solver per call:                 31.1           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.049687        | 81.2        | 1               | 
+Kernels:                                 | 0.00790305      | 12.9        | 96              | 
+Copy host to device:                     | 0.00016891      |  0.3        | 8               | 16.4 GB/s
+Copy device to host:                     | 0.000112647     |  0.2        | 4               | 54.9 GB/s
+Alloc/Free on device:                    | 0               |  0.0        | 0               | 
+GPU: 94% Copy H<->D: 0.46% Alloc/free: 0% Comm: 0% CPU & I/O: 5.5%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.191478       
+
+Total time for the whole computation                                       11.0326        
+
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.eureka_cc89 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.eureka_cc89
new file mode 100644
index 0000000000..1c29c75593
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.eureka_cc89
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_Iterateur_BENCH_AmgX_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:52:46
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2592000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                12.7089        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             2.86719        
+Average number of iteration of the linear solver per call:                 41             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.913517       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.101502       
+Standard deviation between time steps:                                     0.00424278     
+Time elapsed in the skipped time steps:                                    0.122592       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0746953       | 73.6        | 1              
+Convection operator                      | 0.007268645     |  7.2        | 1              
+Diffusion operator                       | 0.002631552     |  2.6        | 1              
+Gradient operator                        | 0.002676735     |  2.6        | 2              
+Divergence operator                      | 0.001694524     |  1.7        | 2              
+Update ::mettre_a_jour                   | 0.001962899     |  1.9        | 1              
+Computation of the time step dt          | 0.001669003     |  1.6        | 2              
+Post-treatment operations                | 0.003160382     |  3.1        | 1              
+Other operations                         | 0.005742889     |  5.7        | 
+
+Average number of iteration of the linear solver per call:                 31.1           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0745032       | 73.4        | 1               | 
+Kernels:                                 | 0.0227984       | 22.5        | 96              | 
+Copy host to device:                     | 0.000396213     |  0.4        | 8               | 7.0 GB/s
+Copy device to host:                     | 0.00111494      |  1.1        | 4               | 5.5 GB/s
+Alloc/Free on device:                    | 0               |  0.0        | 0               | 
+GPU: 96% Copy H<->D: 1.5% Alloc/free: 0% Comm: 0% CPU & I/O: 2.6%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0563904      
+
+Total time for the whole computation                                       13.8014        
+
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.irene-amd-ccrt_cc70 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.irene-amd-ccrt_cc70
index ed993402fa..aae10238db 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.irene-amd-ccrt_cc70
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.irene-amd-ccrt_cc70
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 14:20:52
-OS:       irene7053__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     23-04-2026 -- 14:57:06
+OS:       irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
 Total number of threads:80
 GPU model: Tesla V100-SXM2-16GB
@@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                19.755         
+Total time of the start-up:                                                20.0048        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             4.47609        
+Average time of the resolution of the linear problem per call:             4.83528        
 Average number of iteration of the linear solver per call:                 41             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.61756        
+Total time of the time loop:                                               1.56294        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.179729       
-Standard deviation between time steps:                                     0.00645205     
-Time elapsed in the skipped time steps:                                    0.252334       
+Average time per time step:                                                0.17366        
+Standard deviation between time steps:                                     0.00621117     
+Time elapsed in the skipped time steps:                                    0.240243       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.127466        | 70.9        | 1              
-Convection operator                      | 0.01145873      |  6.4        | 1              
-Diffusion operator                       | 0.004238428     |  2.4        | 1              
-Gradient operator                        | 0.0100999       |  5.6        | 2              
-Divergence operator                      | 0.006367787     |  3.5        | 2              
-Update ::mettre_a_jour                   | 0.005436899     |  3.0        | 1              
-Computation of the time step dt          | 0.006054541     |  3.4        | 2              
-Post-treatment operations                | 0.0041794       |  2.3        | 1              
-Other operations                         | 0.00442691      |  2.5        | 
+Linear solver resolutions Ax=B           | 0.127391        | 73.4        | 1              
+Convection operator                      | 0.01146404      |  6.6        | 1              
+Diffusion operator                       | 0.004249371     |  2.4        | 1              
+Gradient operator                        | 0.004379879     |  2.5        | 2              
+Divergence operator                      | 0.006376915     |  3.7        | 2              
+Update ::mettre_a_jour                   | 0.00544122      |  3.1        | 1              
+Computation of the time step dt          | 0.006055941     |  3.5        | 2              
+Post-treatment operations                | 0.004124303     |  2.4        | 1              
+Other operations                         | 0.004177375     |  2.4        | 
 
 Average number of iteration of the linear solver per call:                 31.1           
 
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 31.1
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.127187        | 70.8        | 1               | 
-Kernels:                                 | 0.046662        | 26.0        | 97              | 
-Copy host to device:                     | 0.000836908     |  0.5        | 8               | 3.3 GB/s
-Copy device to host:                     | 0.00149839      |  0.8        | 4               | 4.1 GB/s
+Libraries:                               | 0.127112        | 73.2        | 1               | 
+Kernels:                                 | 0.0407369       | 23.5        | 96              | 
+Copy host to device:                     | 0.000828729     |  0.5        | 8               | 3.3 GB/s
+Copy device to host:                     | 0.00136563      |  0.8        | 4               | 4.5 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 97% Copy H<->D: 1.3% Alloc/free: 0% Comm: 0% CPU & I/O: 2%
+GPU: 97% Copy H<->D: 1.3% Alloc/free: 0% Comm: 0% CPU & I/O: 2.1%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.174162       
+Time of the post-resolution:                                               0.144198       
 
-Total time for the whole computation                                       21.7991        
+Total time for the whole computation                                       21.9522        
 
-[Slurm] Power consumption (38 s):  0.217 kW  0.002 kWh  0.000 € (0.10€/kWh)
+[Slurm] Power consumption (36 s):  0.160 kW  0.002 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is157091_cc86 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is157091_cc86
index b039591652..3599c3f3fc 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is157091_cc86
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is157091_cc86
@@ -8,7 +8,7 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     10-03-2026 -- 08:42:48
+Date:     22-04-2026 -- 07:55:23
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
@@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                11.0494        
+Total time of the start-up:                                                10.7604        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             2.33687        
+Average time of the resolution of the linear problem per call:             2.48938        
 Average number of iteration of the linear solver per call:                 41             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.63426        
+Total time of the time loop:                                               1.58881        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.181585       
-Standard deviation between time steps:                                     0.00729363     
-Time elapsed in the skipped time steps:                                    0.2154         
+Average time per time step:                                                0.176535       
+Standard deviation between time steps:                                     0.00706665     
+Time elapsed in the skipped time steps:                                    0.206322       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.131686        | 72.5        | 1              
-Convection operator                      | 0.01309432      |  7.2        | 1              
-Diffusion operator                       | 0.00372165      |  2.0        | 1              
-Gradient operator                        | 0.008522686     |  4.7        | 2              
-Divergence operator                      | 0.004614091     |  2.5        | 2              
-Update ::mettre_a_jour                   | 0.003988937     |  2.2        | 1              
-Computation of the time step dt          | 0.006508071     |  3.6        | 2              
-Post-treatment operations                | 0.00263361      |  1.5        | 1              
-Other operations                         | 0.006814981     |  3.8        | 
+Linear solver resolutions Ax=B           | 0.131459        | 74.5        | 1              
+Convection operator                      | 0.01314436      |  7.4        | 1              
+Diffusion operator                       | 0.003713571     |  2.1        | 1              
+Gradient operator                        | 0.003726761     |  2.1        | 2              
+Divergence operator                      | 0.004629008     |  2.6        | 2              
+Update ::mettre_a_jour                   | 0.004018701     |  2.3        | 1              
+Computation of the time step dt          | 0.006475843     |  3.7        | 2              
+Post-treatment operations                | 0.002728818     |  1.5        | 1              
+Other operations                         | 0.006638199     |  3.8        | 
 
 Average number of iteration of the linear solver per call:                 31.1           
 
@@ -60,16 +60,16 @@ Average number of iteration of the linear solver per call:                 31.1
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.131402        | 72.4        | 1               | 
-Kernels:                                 | 0.0469629       | 25.9        | 97              | 
-Copy host to device:                     | 0.000331245     |  0.2        | 8               | 8.4 GB/s
-Copy device to host:                     | 0.000617492     |  0.3        | 4               | 10.0 GB/s
+Libraries:                               | 0.131171        | 74.3        | 1               | 
+Kernels:                                 | 0.0419323       | 23.8        | 96              | 
+Copy host to device:                     | 0.000343416     |  0.2        | 8               | 8.1 GB/s
+Copy device to host:                     | 0.000657896     |  0.4        | 4               | 9.4 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 98% Copy H<->D: 0.52% Alloc/free: 0% Comm: 0% CPU & I/O: 1.3%
+GPU: 98% Copy H<->D: 0.57% Alloc/free: 0% Comm: 0% CPU & I/O: 1.4%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.207824       
+Time of the post-resolution:                                               0.0673813      
 
-Total time for the whole computation                                       13.1069        
+Total time for the whole computation                                       12.6229        
 
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is159479_cc120 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is159479_cc120
new file mode 100644
index 0000000000..146dce3081
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.is159479_cc120
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_Iterateur_BENCH_AmgX_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:50:20
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2592000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                7.24263        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.58355        
+Average number of iteration of the linear solver per call:                 41             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.578466       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.064274       
+Standard deviation between time steps:                                     0.00427381     
+Time elapsed in the skipped time steps:                                    0.0865707      
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0491303       | 76.4        | 1              
+Convection operator                      | 0.004380798     |  6.8        | 1              
+Diffusion operator                       | 0.001477663     |  2.3        | 1              
+Gradient operator                        | 0.001421014     |  2.2        | 2              
+Divergence operator                      | 0.0007430587    |  1.2        | 2              
+Update ::mettre_a_jour                   | 0.0009293183    |  1.4        | 1              
+Computation of the time step dt          | 0.000812331     |  1.3        | 2              
+Post-treatment operations                | 0.002051846     |  3.2        | 1              
+Other operations                         | 0.003327638     |  5.2        | 
+
+Average number of iteration of the linear solver per call:                 31.1           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0490462       | 76.3        | 1               | 
+Kernels:                                 | 0.0126528       | 19.7        | 96              | 
+Copy host to device:                     | 0.000231439     |  0.4        | 8               | 12.0 GB/s
+Copy device to host:                     | 0.000867666     |  1.3        | 4               | 7.1 GB/s
+Alloc/Free on device:                    | 0               |  0.0        | 0               | 
+GPU: 96% Copy H<->D: 1.7% Alloc/free: 0% Comm: 0% CPU & I/O: 2.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0518869      
+
+Total time for the whole computation                                       7.95956        
+
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90
index d3bd20b90b..ed1afe0a6b 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 14:51:53
-OS:       jzxh041__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+Date:     23-04-2026 -- 08:21:31
+OS:       jzxh011__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
 CPU model : Intel(R) Xeon(R) Platinum 8468
 Total number of threads:192
 GPU model: NVIDIA H100 80GB HBM3
@@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                29.0666        
+Total time of the start-up:                                                12.8657        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             6.28925        
+Average time of the resolution of the linear problem per call:             2.66272        
 Average number of iteration of the linear solver per call:                 41             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.612503       
+Total time of the time loop:                                               0.606875       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0680559      
-Standard deviation between time steps:                                     0.00259521     
-Time elapsed in the skipped time steps:                                    0.104328       
+Average time per time step:                                                0.0674305      
+Standard deviation between time steps:                                     0.00262351     
+Time elapsed in the skipped time steps:                                    0.164465       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0506794       | 74.5        | 1              
-Convection operator                      | 0.003897978     |  5.7        | 1              
-Diffusion operator                       | 0.001564434     |  2.3        | 1              
-Gradient operator                        | 0.001842167     |  2.7        | 2              
-Divergence operator                      | 0.001292618     |  1.9        | 2              
-Update ::mettre_a_jour                   | 0.001611427     |  2.4        | 1              
-Computation of the time step dt          | 0.000948646     |  1.4        | 2              
-Post-treatment operations                | 0.002957776     |  4.3        | 1              
-Other operations                         | 0.003261441     |  4.8        | 
+Linear solver resolutions Ax=B           | 0.0499214       | 74.0        | 1              
+Convection operator                      | 0.003907927     |  5.8        | 1              
+Diffusion operator                       | 0.001567694     |  2.3        | 1              
+Gradient operator                        | 0.001694054     |  2.5        | 2              
+Divergence operator                      | 0.001322626     |  2.0        | 2              
+Update ::mettre_a_jour                   | 0.001578162     |  2.3        | 1              
+Computation of the time step dt          | 0.0009521459    |  1.4        | 2              
+Post-treatment operations                | 0.002975969     |  4.4        | 1              
+Other operations                         | 0.003510551     |  5.2        | 
 
 Average number of iteration of the linear solver per call:                 31.1           
 
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 31.1
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0505594       | 74.3        | 1               | 
-Kernels:                                 | 0.012317        | 18.1        | 97              | 
-Copy host to device:                     | 0.000425871     |  0.6        | 8               | 6.5 GB/s
-Copy device to host:                     | 0.000673408     |  1.0        | 4               | 9.2 GB/s
+Libraries:                               | 0.0497978       | 73.9        | 1               | 
+Kernels:                                 | 0.0121603       | 18.0        | 96              | 
+Copy host to device:                     | 0.000414936     |  0.6        | 8               | 6.7 GB/s
+Copy device to host:                     | 0.0006133       |  0.9        | 4               | 10.1 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 92% Copy H<->D: 1.6% Alloc/free: 0% Comm: 0% CPU & I/O: 6%
+GPU: 92% Copy H<->D: 1.5% Alloc/free: 0% Comm: 0% CPU & I/O: 6.6%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.190546       
+Time of the post-resolution:                                               0.173993       
 
-Total time for the whole computation                                       29.974         
+Total time for the whole computation                                       13.8111        
 
-[Slurm] Power consumption (46 s):  0.441 kW  0.006 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (22 s):  0.410 kW  0.003 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.topaze_cc80 b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.topaze_cc80
index 02a0052a50..dc5440df70 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.topaze_cc80
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_AmgX.TU.topaze_cc80
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 17:11:18
-OS:       topaze7071__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     13-06-2026 -- 12:17:57
+OS:       topaze7062__Linux__x86_64__4.18.0-553.123.1.el8_10.x86_64__#1 SMP Mon May 4 13:45:48 EDT 2026
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,35 +22,35 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                16.5775        
+Total time of the start-up:                                                15.101         
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             3.4427         
+Average time of the resolution of the linear problem per call:             3.22742        
 Average number of iteration of the linear solver per call:                 41             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.906586       
+Total time of the time loop:                                               0.877615       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.100732       
-Standard deviation between time steps:                                     0.00337763     
-Time elapsed in the skipped time steps:                                    0.169131       
+Average time per time step:                                                0.0975127      
+Standard deviation between time steps:                                     0.00398041     
+Time elapsed in the skipped time steps:                                    0.144939       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.075859        | 63.5        | 1              
-Convection operator                      | 0.007069063     |  5.9        | 1              
-Diffusion operator                       | 0.002373228     |  2.0        | 1              
-Gradient operator                        | 0.002605388     |  2.2        | 2              
-Divergence operator                      | 0.001750139     |  1.5        | 2              
-Update ::mettre_a_jour                   | 0.001915461     |  1.6        | 1              
-Computation of the time step dt          | 0.002081152     |  1.7        | 2              
-Post-treatment operations                | 0.002956459     |  2.5        | 1              
-Other operations                         | 0.004121942     |  3.4        | 
+Linear solver resolutions Ax=B           | 0.0760297       | 78.0        | 1              
+Convection operator                      | 0.006027708     |  6.2        | 1              
+Diffusion operator                       | 0.002294193     |  2.4        | 1              
+Gradient operator                        | 0.002297408     |  2.4        | 2              
+Divergence operator                      | 0.00122019      |  1.3        | 2              
+Update ::mettre_a_jour                   | 0.00161066      |  1.7        | 1              
+Computation of the time step dt          | 0.001546687     |  1.6        | 2              
+Post-treatment operations                | 0.002914558     |  3.0        | 1              
+Other operations                         | 0.003571631     |  3.7        | 
 
 Average number of iteration of the linear solver per call:                 31.1           
 
@@ -60,16 +60,17 @@ Average number of iteration of the linear solver per call:                 31.1
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0757034       | 75.2        | 1               | 
-Kernels:                                 | 0.0210069       | 20.9        | 97              | 
-Copy host to device:                     | 0.000323824     |  0.3        | 8               | 8.5 GB/s
-Copy device to host:                     | 0.000551814     |  0.5        | 4               | 11.2 GB/s
+Libraries:                               | 0.07588         | 77.8        | 1               | 
+Kernels:                                 | 0.0176215       | 18.1        | 96              | 
+Copy host to device:                     | 0.000328309     |  0.3        | 8               | 8.4 GB/s
+Copy device to host:                     | 0.00052074      |  0.5        | 4               | 11.9 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 96% Copy H<->D: 0.87% Alloc/free: 0% Comm: 0% CPU & I/O: 3.1%
+GPU: 96% Copy H<->D: 0.87% Alloc/free: 0% Comm: 0% CPU & I/O: 3.2%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.295341       
+Time of the post-resolution:                                               0.163937       
 
-Total time for the whole computation                                       17.9485        
+Total time for the whole computation                                       16.2875        
 
+[Slurm] Power consumption (55 s):  0.416 kW  0.006 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90a b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90a
index 237ba5b0d9..71f04bb7ea 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90a
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     12-03-2026 -- 16:38:02
-OS:       g1229__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 09:27:31
+OS:       g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                15.1194        
+Total time of the start-up:                                                14.8615        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.55704        
-Average number of iteration of the linear solver per call:                 17.5           
+Average time of the resolution of the linear problem per call:             1.74019        
+Average number of iteration of the linear solver per call:                 18.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.11758        
+Total time of the time loop:                                               1.0108         
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.124175       
-Standard deviation between time steps:                                     0.00298425     
-Time elapsed in the skipped time steps:                                    0.163265       
+Average time per time step:                                                0.112311       
+Standard deviation between time steps:                                     0.00396454     
+Time elapsed in the skipped time steps:                                    0.135985       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0515469       | 41.5        | 1              
-Convection operator                      | 0.01753108      | 14.1        | 1              
-Diffusion operator                       | 0.008836219     |  7.1        | 1              
-Gradient operator                        | 0.02405718      | 19.4        | 2              
-Divergence operator                      | 0.005545095     |  4.5        | 2              
-Update ::mettre_a_jour                   | 0.004695313     |  3.8        | 1              
-Computation of the time step dt          | 0.00673917      |  5.4        | 2              
-Post-treatment operations                | 0.002858661     |  2.3        | 1              
-Other operations                         | 0.002365791     |  1.9        | 
+Linear solver resolutions Ax=B           | 0.0553498       | 49.3        | 1              
+Convection operator                      | 0.01779112      | 15.8        | 1              
+Diffusion operator                       | 0.009032714     |  8.0        | 1              
+Gradient operator                        | 0.007908956     |  7.0        | 2              
+Divergence operator                      | 0.005581833     |  5.0        | 2              
+Update ::mettre_a_jour                   | 0.004612555     |  4.1        | 1              
+Computation of the time step dt          | 0.006740941     |  6.0        | 2              
+Post-treatment operations                | 0.002857232     |  2.5        | 1              
+Other operations                         | 0.002436298     |  2.2        | 
 
-Average number of iteration of the linear solver per call:                 13.6           
+Average number of iteration of the linear solver per call:                 14.4           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 13.6
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0513757       | 41.4        | 1               | 
-Kernels:                                 | 0.0690257       | 55.6        | 97              | 
-Copy host to device:                     | 0.000286796     |  0.2        | 8               | 9.7 GB/s
-Copy device to host:                     | 0.000368582     |  0.3        | 4               | 16.8 GB/s
+Libraries:                               | 0.0551808       | 49.1        | 1               | 
+Kernels:                                 | 0.0532523       | 47.4        | 96              | 
+Copy host to device:                     | 0.000285677     |  0.3        | 8               | 9.7 GB/s
+Copy device to host:                     | 0.000371596     |  0.3        | 4               | 16.6 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 97% Copy H<->D: 0.53% Alloc/free: 0% Comm: 0% CPU & I/O: 2.5%
+GPU: 97% Copy H<->D: 0.59% Alloc/free: 0% Comm: 0% CPU & I/O: 2.9%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.150748       
+Time of the post-resolution:                                               0.155407       
 
-Total time for the whole computation                                       16.551         
+Total time for the whole computation                                       16.1637        
 
-[Slurm] Power consumption (24 s):  0.395 kW  0.003 kWh  0.000 € (0.10€/kWh)
+[Slurm] Power consumption (23 s):  0.392 kW  0.003 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90a b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90a
index 590e3e362a..efe1a573df 100644
--- a/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90a
+++ b/tests/GPU/OpenMP_Iterateur/OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     24-02-2026 -- 00:15:17
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     15-05-2026 -- 20:43:18
+OS:       nid005023__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                53.4901        
+Total time of the start-up:                                                72.2337        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             3.29056        
-Average number of iteration of the linear solver per call:                 17.5           
+Average time of the resolution of the linear problem per call:             3.83761        
+Average number of iteration of the linear solver per call:                 19.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.12433        
+Total time of the time loop:                                               1.00974        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.124926       
-Standard deviation between time steps:                                     0.00291511     
-Time elapsed in the skipped time steps:                                    0.149759       
+Average time per time step:                                                0.112194       
+Standard deviation between time steps:                                     0.00433867     
+Time elapsed in the skipped time steps:                                    0.138662       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0503064       | 35.5        | 1              
-Convection operator                      | 0.02040081      | 14.4        | 1              
-Diffusion operator                       | 0.007673064     |  5.4        | 1              
-Gradient operator                        | 0.02397075      | 16.9        | 2              
-Divergence operator                      | 0.005495745     |  3.9        | 2              
-Update ::mettre_a_jour                   | 0.004568919     |  3.2        | 1              
-Computation of the time step dt          | 0.006858336     |  4.8        | 2              
-Post-treatment operations                | 0.002841983     |  2.0        | 1              
-Other operations                         | 0.002810071     |  2.0        | 
+Linear solver resolutions Ax=B           | 0.0555408       | 49.5        | 1              
+Convection operator                      | 0.01741344      | 15.5        | 1              
+Diffusion operator                       | 0.008625326     |  7.7        | 1              
+Gradient operator                        | 0.008388428     |  7.5        | 2              
+Divergence operator                      | 0.005668888     |  5.1        | 2              
+Update ::mettre_a_jour                   | 0.004655274     |  4.1        | 1              
+Computation of the time step dt          | 0.006789526     |  6.1        | 2              
+Post-treatment operations                | 0.002835855     |  2.5        | 1              
+Other operations                         | 0.002276084     |  2.0        | 
 
-Average number of iteration of the linear solver per call:                 13.6           
+Average number of iteration of the linear solver per call:                 15             
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -60,17 +60,17 @@ Average number of iteration of the linear solver per call:                 13.6
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0501355       | 40.1        | 1               | 
-Kernels:                                 | 0.0709778       | 56.8        | 97              | 
-Copy host to device:                     | 0.000305926     |  0.2        | 8               | 9.0 GB/s
-Copy device to host:                     | 0.000366232     |  0.3        | 4               | 16.9 GB/s
+Libraries:                               | 0.0553671       | 49.3        | 1               | 
+Kernels:                                 | 0.0529794       | 47.2        | 96              | 
+Copy host to device:                     | 0.000301069     |  0.3        | 8               | 9.2 GB/s
+Copy device to host:                     | 0.00036894      |  0.3        | 4               | 16.8 GB/s
 Alloc/Free on device:                    | 0               |  0.0        | 0               | 
-GPU: 97% Copy H<->D: 0.54% Alloc/free: 0% Comm: 0% CPU & I/O: 2.5%
+GPU: 97% Copy H<->D: 0.6% Alloc/free: 0% Comm: 0% CPU & I/O: 2.8%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.143902       
+Time of the post-resolution:                                               0.15637        
 
-Total time for the whole computation                                       54.9081        
+Total time for the whole computation                                       73.5385        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (101 s):  0.467 kW  0.013 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100x4 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100x4
new file mode 100644
index 0000000000..e1d34358e0
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.dalianvl_cc100x4
@@ -0,0 +1,101 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_AmgX_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-06-2026 -- 12:56:02
+OS:       dalianvl16__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 4
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2592000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.31723        
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              113            
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.987298       
+Average number of iteration of the linear solver per call:                 45.5           
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.66244        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.184716       
+Standard deviation between time steps:                                     0.013327       
+Time elapsed in the skipped time steps:                                    0.251559       
+
+Percent of total time spend in communication:                              0.554609       
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.176631        | 95.6        | 1              
+Convection operator                      | 0.0009179114    |  0.5        | 1              
+Diffusion operator                       | 0.0003050746    |  0.2        | 1              
+Gradient operator                        | 0.0003565659    |  0.2        | 2              
+Divergence operator                      | 0.0002437128    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.001284476     |  0.7        | 1              
+Computation of the time step dt          | 0.0003241821    |  0.2        | 2              
+Post-treatment operations                | 0.0007485209    |  0.4        | 1              
+Other operations                         | 0.0039044       |  2.1        | 
+Number of virtual exchanges per time step:                                 9              
+Maximum number of MPI allreduce per time step                              14             
+
+Average number of iteration of the linear solver per call:                 34.7           
+
+
+---------------------------------------------------------------------------------------------------------
+Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
+---------------------------------------------------------------------------------------------------------
+
+Average of the fraction of the time spent in communications between processors:      0.8            %
+Max of the fraction of the time spent in communications between processors:          1.1            %
+Min of the fraction of the time spent in communications between processors:          0.6            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         3.48609e-06    
+Network maximum bandwidth on all processors:                                         54.2 GB/s      
+Total network traffic:                                                               194.73         MB/time step
+Average message size:                                                                740.105        kB
+Min waiting time:                                                                    0.6            % of total time
+Max waiting time:                                                                    1.1            % of total time
+Avg waiting time:                                                                    0.95           % of total time
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.176361        | 95.5        | 1               | 
+Kernels:                                 | 0.00414017      |  2.2        | 151             | 
+Copy host to device:                     | 0.000122941     |  0.1        | 5               | 6.2 GB/s
+Copy device to host:                     | 0.000104189     |  0.1        | 4               | 14.6 GB/s
+Alloc/Free on device:                    | 4.65778e-07     |  0.0        | 6               | 
+GPU: 98% Copy H<->D: 0.12% Alloc/free: 0.00025% Comm: 0.64% CPU & I/O: 1.5%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.469793       
+Maximum number of MPI allreduce per time step                              6              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Max waiting time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Total time for the whole computation                                       8.70103        
+
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90x4 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90x4
new file mode 100644
index 0000000000..bd4d2b39f6
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX.TU.jean-zay_cc90x4
@@ -0,0 +1,102 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_AmgX_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     10-06-2026 -- 15:24:25
+OS:       jzxh032__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.20
+Nb procs used for the computation: 4
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2592000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                8.97592        
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              113            
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.69108        
+Average number of iteration of the linear solver per call:                 45.5           
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.912864       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.101429       
+Standard deviation between time steps:                                     0.00683941     
+Time elapsed in the skipped time steps:                                    0.232016       
+
+Percent of total time spend in communication:                              0.73194        
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0933083       | 92.0        | 1              
+Convection operator                      | 0.001174145     |  1.2        | 1              
+Diffusion operator                       | 0.0004917334    |  0.5        | 1              
+Gradient operator                        | 0.000429222     |  0.4        | 2              
+Divergence operator                      | 0.0002555969    |  0.3        | 2              
+Update ::mettre_a_jour                   | 0.001087015     |  1.1        | 1              
+Computation of the time step dt          | 0.0004033608    |  0.4        | 2              
+Post-treatment operations                | 0.001093626     |  1.1        | 1              
+Other operations                         | 0.003186304     |  3.1        | 
+Number of virtual exchanges per time step:                                 9              
+Maximum number of MPI allreduce per time step                              14             
+
+Average number of iteration of the linear solver per call:                 34.7           
+
+
+---------------------------------------------------------------------------------------------------------
+Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
+---------------------------------------------------------------------------------------------------------
+
+Average of the fraction of the time spent in communications between processors:      2.7            %
+Max of the fraction of the time spent in communications between processors:          4.2            %
+Min of the fraction of the time spent in communications between processors:          0.8            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         4.03775e-06    
+Network maximum bandwidth on all processors:                                         62.0 GB/s      
+Total network traffic:                                                               194.73         MB/time step
+Average message size:                                                                740.105        kB
+Min waiting time:                                                                    0.8            % of total time
+Max waiting time:                                                                    4.1            % of total time
+Avg waiting time:                                                                    3.25           % of total time
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.093002        | 91.7        | 1               | 
+Kernels:                                 | 0.00481281      |  4.7        | 151             | 
+Copy host to device:                     | 0.000142865     |  0.1        | 5               | 5.3 GB/s
+Copy device to host:                     | 0.000247582     |  0.2        | 4               | 6.2 GB/s
+Alloc/Free on device:                    | 3.79444e-07     |  0.0        | 6               | 
+GPU: 96% Copy H<->D: 0.38% Alloc/free: 0.00037% Comm: 0.92% CPU & I/O: 2.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.495779       
+Maximum number of MPI allreduce per time step                              6              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Max waiting time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Total time for the whole computation                                       10.6166        
+
+[Slurm] Power consumption (27 s):  0.438 kW  0.003 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX_10.TU.jean-zay_cc90x8 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX_10.TU.jean-zay_cc90x8
new file mode 100644
index 0000000000..4910e01b67
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_AmgX_10.TU.jean-zay_cc90x8
@@ -0,0 +1,102 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_AmgX_10_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     10-06-2026 -- 16:05:33
+OS:       jzxh250__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.20
+Nb procs used for the computation: 8
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 80864000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                62.3521        
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              113            
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             15.4526        
+Average number of iteration of the linear solver per call:                 97             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               4.87771        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.541968       
+Standard deviation between time steps:                                     0.0373707      
+Time elapsed in the skipped time steps:                                    0.944194       
+
+Percent of total time spend in communication:                              1.12627        
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.479851        | 88.5        | 1              
+Convection operator                      | 0.01469592      |  2.7        | 1              
+Diffusion operator                       | 0.00588249      |  1.1        | 1              
+Gradient operator                        | 0.005783263     |  1.1        | 2              
+Divergence operator                      | 0.00287426      |  0.5        | 2              
+Update ::mettre_a_jour                   | 0.01230296      |  2.3        | 1              
+Computation of the time step dt          | 0.003788952     |  0.7        | 2              
+Post-treatment operations                | 0.004320248     |  0.8        | 1              
+Other operations                         | 0.01246934      |  2.3        | 
+Number of virtual exchanges per time step:                                 9              
+Maximum number of MPI allreduce per time step                              14             
+
+Average number of iteration of the linear solver per call:                 72             
+
+
+---------------------------------------------------------------------------------------------------------
+Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
+---------------------------------------------------------------------------------------------------------
+
+Average of the fraction of the time spent in communications between processors:      1.6            %
+Max of the fraction of the time spent in communications between processors:          2.6            %
+Min of the fraction of the time spent in communications between processors:          1.2            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         5.31916e-06    
+Network maximum bandwidth on all processors:                                         172.1 GB/s     
+Total network traffic:                                                               2535.2         MB/time step
+Average message size:                                                                4477.39        kB
+Min waiting time:                                                                    1.3            % of total time
+Max waiting time:                                                                    2.4            % of total time
+Avg waiting time:                                                                    1.9625         % of total time
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.479276        | 88.4        | 1               | 
+Kernels:                                 | 0.0465683       |  8.6        | 133             | 
+Copy host to device:                     | 0.000555787     |  0.1        | 5               | 8.8 GB/s
+Copy device to host:                     | 0.000704379     |  0.1        | 4               | 13.0 GB/s
+Alloc/Free on device:                    | 4.16333e-07     |  0.0        | 6               | 
+GPU: 97% Copy H<->D: 0.23% Alloc/free: 7.7e-05% Comm: 1.3% CPU & I/O: 1.4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               1.47244        
+Maximum number of MPI allreduce per time step                              6              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Max waiting time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Total time for the whole computation                                       69.6465        
+
+[Slurm] Power consumption (86 s):  0.950 kW  0.023 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90ax4 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90ax4
index f63e88a6e3..d47299e2f6 100644
--- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90ax4
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx90ax4
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 21:16:48
-OS:       g1023__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     08-06-2026 -- 16:11:54
+OS:       g1323__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,74 +22,74 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                9.11069        
-Number of virtual exchanges:                                               61             
-Maximum number of MPI allreduce per time step                              113            
+Total time of the start-up:                                                15.7382        
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              112            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.745132       
-Average number of iteration of the linear solver per call:                 17.5           
+Average time of the resolution of the linear problem per call:             1.49685        
+Average number of iteration of the linear solver per call:                 19             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.749941       
+Total time of the time loop:                                               0.536061       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0833267      
-Standard deviation between time steps:                                     0.00336632     
-Time elapsed in the skipped time steps:                                    0.109205       
+Average time per time step:                                                0.0595624      
+Standard deviation between time steps:                                     0.0020151      
+Time elapsed in the skipped time steps:                                    0.0832984      
 
-Percent of total time spend in communication:                              1.85683        
+Percent of total time spend in communication:                              2.96068        
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0588244       | 61.6        | 1              
-Convection operator                      | 0.00587872      |  6.2        | 1              
-Diffusion operator                       | 0.002212806     |  2.3        | 1              
-Gradient operator                        | 0.006406243     |  6.7        | 2              
-Divergence operator                      | 0.001422708     |  1.5        | 2              
-Update ::mettre_a_jour                   | 0.002037852     |  2.1        | 1              
-Computation of the time step dt          | 0.001919701     |  2.0        | 2              
-Post-treatment operations                | 0.00125179      |  1.3        | 1              
-Other operations                         | 0.00337248      |  3.5        | 
-Number of virtual exchanges per time step:                                 10             
+Linear solver resolutions Ax=B           | 0.0390233       | 65.5        | 1              
+Convection operator                      | 0.005349557     |  9.0        | 1              
+Diffusion operator                       | 0.002420475     |  4.1        | 1              
+Gradient operator                        | 0.002731023     |  4.6        | 2              
+Divergence operator                      | 0.0008404393    |  1.4        | 2              
+Update ::mettre_a_jour                   | 0.001865685     |  3.1        | 1              
+Computation of the time step dt          | 0.001907613     |  3.2        | 2              
+Post-treatment operations                | 0.001141614     |  1.9        | 1              
+Other operations                         | 0.004282682     |  7.2        | 
+Number of virtual exchanges per time step:                                 9              
 Maximum number of MPI allreduce per time step                              14             
 
-Average number of iteration of the linear solver per call:                 13.6           
+Average number of iteration of the linear solver per call:                 14.8           
 
 
 ---------------------------------------------------------------------------------------------------------
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      2.2            %
-Max of the fraction of the time spent in communications between processors:          2.9            %
-Min of the fraction of the time spent in communications between processors:          2.1            %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         2.15456e-06    
-Network maximum bandwidth on all processors:                                         33.5 GB/s      
+Average of the fraction of the time spent in communications between processors:      3.2            %
+Max of the fraction of the time spent in communications between processors:          4              %
+Min of the fraction of the time spent in communications between processors:          3.4            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         2.38312e-06    
+Network maximum bandwidth on all processors:                                         31.0 GB/s      
 Total network traffic:                                                               194.73         MB/time step
 Average message size:                                                                751.53         kB
-Min waiting time:                                                                    2              % of total time
-Max waiting time:                                                                    2.9            % of total time
-Avg waiting time:                                                                    2.5            % of total time
+Min waiting time:                                                                    3.3            % of total time
+Max waiting time:                                                                    3.9            % of total time
+Avg waiting time:                                                                    3.6            % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0585978       | 70.3        | 1               | 
-Kernels:                                 | 0.0209497       | 25.1        | 151             | 
-Copy host to device:                     | 0.000159277     |  0.2        | 5               | 4.8 GB/s
-Copy device to host:                     | 0.000248381     |  0.3        | 4               | 6.1 GB/s
-Alloc/Free on device:                    | 1.34147e-05     |  0.0        | 6               | 
-GPU: 95% Copy H<->D: 0.49% Alloc/free: 0.016% Comm: 2.1% CPU & I/O: 1.9%
+Libraries:                               | 0.0387932       | 65.1        | 1               | 
+Kernels:                                 | 0.016855        | 28.3        | 151             | 
+Copy host to device:                     | 0.000166681     |  0.3        | 5               | 4.6 GB/s
+Copy device to host:                     | 0.000260071     |  0.4        | 4               | 5.9 GB/s
+Alloc/Free on device:                    | 3.67556e-07     |  0.0        | 6               | 
+GPU: 93% Copy H<->D: 0.72% Alloc/free: 0.00062% Comm: 3.4% CPU & I/O: 2.4%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.745979       
+Time of the post-resolution:                                               0.75471        
 Maximum number of MPI allreduce per time step                              6              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -97,6 +97,6 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       10.7158        
+Total time for the whole computation                                       17.1123        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (28 s):  0.465 kW  0.004 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx942x4 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx942x4
index 07313561f2..cc3bd0fc9b 100644
--- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx942x4
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.adastra_gfx942x4
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 18:11:04
+Date:     08-06-2026 -- 14:46:38
 OS:       a1001__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD Instinct MI300A Accelerator
 Total number of threads:192
 GPU model: AMD Instinct MI300A
-HIP runtime version: 6.41
-HIP drivers version: 6.41
+HIP runtime version: 6.43
+HIP drivers version: 6.43
 Nb procs used for the computation: 4
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 2592000
@@ -22,74 +22,74 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                4.0396         
-Number of virtual exchanges:                                               61             
-Maximum number of MPI allreduce per time step                              113            
+Total time of the start-up:                                                10.1716        
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              112            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.66545        
-Average number of iteration of the linear solver per call:                 17.5           
+Average time of the resolution of the linear problem per call:             0.95387        
+Average number of iteration of the linear solver per call:                 16.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.783478       
+Total time of the time loop:                                               0.480853       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0870531      
-Standard deviation between time steps:                                     0.00559937     
-Time elapsed in the skipped time steps:                                    0.123638       
+Average time per time step:                                                0.0534281      
+Standard deviation between time steps:                                     0.00226311     
+Time elapsed in the skipped time steps:                                    0.0790494      
 
-Percent of total time spend in communication:                              2.03525        
+Percent of total time spend in communication:                              3.33139        
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0768391       | 76.2        | 1              
-Convection operator                      | 0.001744362     |  1.7        | 1              
-Diffusion operator                       | 0.0007313998    |  0.7        | 1              
-Gradient operator                        | 0.0007877113    |  0.8        | 2              
-Divergence operator                      | 0.0007087598    |  0.7        | 2              
-Update ::mettre_a_jour                   | 0.001926733     |  1.9        | 1              
-Computation of the time step dt          | 0.0008340134    |  0.8        | 2              
-Post-treatment operations                | 0.001047411     |  1.0        | 1              
-Other operations                         | 0.00243357      |  2.4        | 
-Number of virtual exchanges per time step:                                 10             
+Linear solver resolutions Ax=B           | 0.0431755       | 80.8        | 1              
+Convection operator                      | 0.00164288      |  3.1        | 1              
+Diffusion operator                       | 0.0007242174    |  1.4        | 1              
+Gradient operator                        | 0.0007095389    |  1.3        | 2              
+Divergence operator                      | 0.0005097047    |  1.0        | 2              
+Update ::mettre_a_jour                   | 0.001811757     |  3.4        | 1              
+Computation of the time step dt          | 0.0008200627    |  1.5        | 2              
+Post-treatment operations                | 0.0009717177    |  1.8        | 1              
+Other operations                         | 0.00306278      |  5.7        | 
+Number of virtual exchanges per time step:                                 9              
 Maximum number of MPI allreduce per time step                              14             
 
-Average number of iteration of the linear solver per call:                 13.6           
+Average number of iteration of the linear solver per call:                 13.2           
 
 
 ---------------------------------------------------------------------------------------------------------
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      2              %
-Max of the fraction of the time spent in communications between processors:          2.7            %
-Min of the fraction of the time spent in communications between processors:          2              %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         5.92733e-06    
-Network maximum bandwidth on all processors:                                         39.2 GB/s      
+Average of the fraction of the time spent in communications between processors:      3.3            %
+Max of the fraction of the time spent in communications between processors:          4.2            %
+Min of the fraction of the time spent in communications between processors:          3.5            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         5.58282e-06    
+Network maximum bandwidth on all processors:                                         36.3 GB/s      
 Total network traffic:                                                               194.73         MB/time step
 Average message size:                                                                751.53         kB
-Min waiting time:                                                                    2              % of total time
-Max waiting time:                                                                    2.6            % of total time
-Avg waiting time:                                                                    2.2            % of total time
+Min waiting time:                                                                    3.3            % of total time
+Max waiting time:                                                                    4              % of total time
+Avg waiting time:                                                                    3.65           % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0766393       | 88.0        | 1               | 
-Kernels:                                 | 0.00695985      |  8.0        | 151             | 
-Copy host to device:                     | 9.11892e-05     |  0.1        | 5               | 8.4 GB/s
-Copy device to host:                     | 0.000198859     |  0.2        | 4               | 7.7 GB/s
-Alloc/Free on device:                    | 1.3989e-05      |  0.0        | 6               | 
-GPU: 96% Copy H<->D: 0.33% Alloc/free: 0.016% Comm: 2.4% CPU & I/O: 1.3%
+Libraries:                               | 0.0429498       | 80.4        | 1               | 
+Kernels:                                 | 0.00693115      | 13.0        | 151             | 
+Copy host to device:                     | 0.000129734     |  0.2        | 5               | 5.9 GB/s
+Copy device to host:                     | 0.000197762     |  0.4        | 4               | 7.7 GB/s
+Alloc/Free on device:                    | 3.04444e-07     |  0.0        | 6               | 
+GPU: 93% Copy H<->D: 0.61% Alloc/free: 0.00057% Comm: 3.9% CPU & I/O: 2.1%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.815148       
+Time of the post-resolution:                                               0.792903       
 Maximum number of MPI allreduce per time step                              6              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -97,6 +97,6 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       5.76188        
+Total time for the whole computation                                       11.5244        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (19 s):  0.680 kW  0.004 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.is157091_cc86x2 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.is157091_cc86x2
index 3f45adaf4f..25fb24f1eb 100644
--- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.is157091_cc86x2
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.is157091_cc86x2
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     11-03-2026 -- 18:35:57
+Date:     08-06-2026 -- 11:20:06
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
 GPU model: NVIDIA RTX A6000
 CUDA runtime version: 12.90
-CUDA drivers version: 12.70
+CUDA drivers version: 13.20
 Nb procs used for the computation: 2
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 2592000
@@ -22,39 +22,39 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                7.32206        
-Number of virtual exchanges:                                               61             
-Maximum number of MPI allreduce per time step                              113            
+Total time of the start-up:                                                5.76054        
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              112            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.74206        
+Average time of the resolution of the linear problem per call:             1.20091        
 Average number of iteration of the linear solver per call:                 18.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               4.2591         
+Total time of the time loop:                                               1.92849        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.473233       
-Standard deviation between time steps:                                     0.0281796      
-Time elapsed in the skipped time steps:                                    0.559638       
+Average time per time step:                                                0.214277       
+Standard deviation between time steps:                                     0.0078799      
+Time elapsed in the skipped time steps:                                    0.253926       
 
-Percent of total time spend in communication:                              0.541268       
+Percent of total time spend in communication:                              3.72389        
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.403631        | 85.3        | 1              
-Convection operator                      | 0.01615821      |  3.4        | 1              
-Diffusion operator                       | 0.004912699     |  1.0        | 1              
-Gradient operator                        | 0.007781903     |  1.6        | 2              
-Divergence operator                      | 0.003409271     |  0.7        | 2              
-Update ::mettre_a_jour                   | 0.007377545     |  1.6        | 1              
-Computation of the time step dt          | 0.008302543     |  1.8        | 2              
-Post-treatment operations                | 0.002612658     |  0.6        | 1              
-Other operations                         | 0.0190475       |  4.0        | 
-Number of virtual exchanges per time step:                                 10             
+Linear solver resolutions Ax=B           | 0.161752        | 75.5        | 1              
+Convection operator                      | 0.01520807      |  7.1        | 1              
+Diffusion operator                       | 0.00419924      |  2.0        | 1              
+Gradient operator                        | 0.003383048     |  1.6        | 2              
+Divergence operator                      | 0.001593399     |  0.7        | 2              
+Update ::mettre_a_jour                   | 0.004787313     |  2.2        | 1              
+Computation of the time step dt          | 0.007714073     |  3.6        | 2              
+Post-treatment operations                | 0.002160597     |  1.0        | 1              
+Other operations                         | 0.01347913      |  6.3        | 
+Number of virtual exchanges per time step:                                 9              
 Maximum number of MPI allreduce per time step                              14             
 
 Average number of iteration of the linear solver per call:                 13.7           
@@ -64,32 +64,32 @@ Average number of iteration of the linear solver per call:                 13.7
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      0.5            %
-Max of the fraction of the time spent in communications between processors:          0.6            %
-Min of the fraction of the time spent in communications between processors:          0.6            %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         1.28883e-06    
-Network maximum bandwidth on all processors:                                         21.0 GB/s      
+Average of the fraction of the time spent in communications between processors:      3.1            %
+Max of the fraction of the time spent in communications between processors:          4.3            %
+Min of the fraction of the time spent in communications between processors:          2.7            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         2.34412e-06    
+Network maximum bandwidth on all processors:                                          8.4 GB/s      
 Total network traffic:                                                               93.8292        MB/time step
 Average message size:                                                                1893.41        kB
-Min waiting time:                                                                    0.6            % of total time
-Max waiting time:                                                                    0.6            % of total time
-Avg waiting time:                                                                    0.6            % of total time
+Min waiting time:                                                                    2.8            % of total time
+Max waiting time:                                                                    4.1            % of total time
+Avg waiting time:                                                                    3.45           % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.402598        | 85.1        | 1               | 
-Kernels:                                 | 0.0651709       | 13.8        | 115             | 
-Copy host to device:                     | 0.000457177     |  0.1        | 5               | 3.1 GB/s
-Copy device to host:                     | 0.000608826     |  0.1        | 4               | 5.0 GB/s
-Alloc/Free on device:                    | 7.70153e-05     |  0.0        | 6               | 
-GPU: 99% Copy H<->D: 0.23% Alloc/free: 0.016% Comm: 0.61% CPU & I/O: 0.3%
+Libraries:                               | 0.161239        | 75.2        | 1               | 
+Kernels:                                 | 0.0413889       | 19.3        | 115             | 
+Copy host to device:                     | 0.000220838     |  0.1        | 5               | 6.4 GB/s
+Copy device to host:                     | 0.00060274      |  0.3        | 4               | 5.1 GB/s
+Alloc/Free on device:                    | 2.69667e-07     |  0.0        | 6               | 
+GPU: 95% Copy H<->D: 0.38% Alloc/free: 0.00013% Comm: 4.2% CPU & I/O: 0.84%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0699469      
+Time of the post-resolution:                                               0.0763386      
 Maximum number of MPI allreduce per time step                              6              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -97,5 +97,5 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       12.2108        
+Total time for the whole computation                                       8.0193         
 
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90ax4 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90ax4
index c2bef7f8ba..30e69c3d42 100644
--- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90ax4
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc.TU.lumi_gfx90ax4
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     24-02-2026 -- 00:19:51
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     05-06-2026 -- 23:15:15
+OS:       nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,74 +22,74 @@ Total number of elements used for the calculation: 2592000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                37.6937        
-Number of virtual exchanges:                                               61             
-Maximum number of MPI allreduce per time step                              113            
+Total time of the start-up:                                                50.583         
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              112            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             2.50994        
+Average time of the resolution of the linear problem per call:             3.77037        
 Average number of iteration of the linear solver per call:                 17.5           
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.751937       
+Total time of the time loop:                                               0.526528       
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.0835485      
-Standard deviation between time steps:                                     0.00334088     
-Time elapsed in the skipped time steps:                                    0.109768       
+Average time per time step:                                                0.0585031      
+Standard deviation between time steps:                                     0.0027974      
+Time elapsed in the skipped time steps:                                    0.0731386      
 
-Percent of total time spend in communication:                              1.7599         
+Percent of total time spend in communication:                              2.01856        
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0598234       | 62.5        | 1              
-Convection operator                      | 0.005831945     |  6.1        | 1              
-Diffusion operator                       | 0.00206353      |  2.2        | 1              
-Gradient operator                        | 0.006198873     |  6.5        | 2              
-Divergence operator                      | 0.001405442     |  1.5        | 2              
-Update ::mettre_a_jour                   | 0.002029805     |  2.1        | 1              
-Computation of the time step dt          | 0.001912096     |  2.0        | 2              
-Post-treatment operations                | 0.001101474     |  1.2        | 1              
-Other operations                         | 0.003181929     |  3.3        | 
-Number of virtual exchanges per time step:                                 10             
+Linear solver resolutions Ax=B           | 0.0386508       | 66.1        | 1              
+Convection operator                      | 0.005404908     |  9.2        | 1              
+Diffusion operator                       | 0.002414174     |  4.1        | 1              
+Gradient operator                        | 0.00281482      |  4.8        | 2              
+Divergence operator                      | 0.0008206784    |  1.4        | 2              
+Update ::mettre_a_jour                   | 0.00173183      |  3.0        | 1              
+Computation of the time step dt          | 0.001880702     |  3.2        | 2              
+Post-treatment operations                | 0.001186549     |  2.0        | 1              
+Other operations                         | 0.00359863      |  6.2        | 
+Number of virtual exchanges per time step:                                 9              
 Maximum number of MPI allreduce per time step                              14             
 
-Average number of iteration of the linear solver per call:                 13.6           
+Average number of iteration of the linear solver per call:                 14.1           
 
 
 ---------------------------------------------------------------------------------------------------------
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      1.9            %
-Max of the fraction of the time spent in communications between processors:          2.5            %
-Min of the fraction of the time spent in communications between processors:          1.9            %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         2.09506e-06    
-Network maximum bandwidth on all processors:                                         43.8 GB/s      
+Average of the fraction of the time spent in communications between processors:      2.9            %
+Max of the fraction of the time spent in communications between processors:          4.3            %
+Min of the fraction of the time spent in communications between processors:          2.2            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         2.31699e-06    
+Network maximum bandwidth on all processors:                                         40.1 GB/s      
 Total network traffic:                                                               194.73         MB/time step
 Average message size:                                                                751.53         kB
-Min waiting time:                                                                    1.8            % of total time
-Max waiting time:                                                                    2.4            % of total time
-Avg waiting time:                                                                    2.125          % of total time
+Min waiting time:                                                                    2.2            % of total time
+Max waiting time:                                                                    4              % of total time
+Avg waiting time:                                                                    3.225          % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0595808       | 71.3        | 1               | 
-Kernels:                                 | 0.020409        | 24.4        | 151             | 
-Copy host to device:                     | 0.00016724      |  0.2        | 5               | 4.6 GB/s
-Copy device to host:                     | 0.000260598     |  0.3        | 4               | 5.9 GB/s
-Alloc/Free on device:                    | 1.6014e-05      |  0.0        | 6               | 
-GPU: 96% Copy H<->D: 0.51% Alloc/free: 0.019% Comm: 2% CPU & I/O: 1.7%
+Libraries:                               | 0.0384101       | 65.7        | 1               | 
+Kernels:                                 | 0.0168738       | 28.8        | 151             | 
+Copy host to device:                     | 0.000169971     |  0.3        | 5               | 4.5 GB/s
+Copy device to host:                     | 0.000271035     |  0.5        | 4               | 5.6 GB/s
+Alloc/Free on device:                    | 4.24111e-07     |  0.0        | 6               | 
+GPU: 94% Copy H<->D: 0.75% Alloc/free: 0.00072% Comm: 2.3% CPU & I/O: 2.4%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.599376       
+Time of the post-resolution:                                               0.829184       
 Maximum number of MPI allreduce per time step                              6              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -97,6 +97,6 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       39.1548        
+Total time for the whole computation                                       52.0119        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (74 s):  0.495 kW  0.010 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx90ax16 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx90ax16
index e56f9af632..c4c221bcd4 100644
--- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx90ax16
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx90ax16
@@ -1,102 +1 @@
-                                             # Global performance file #
-
-This is the global file for tracking performance in TRUST. It stores aggregated quantities.
-More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_PETSc_10_csv.TU file
-For time loop, only standard counters of level 1 are printed alongside your custom counters
-Time is given in seconds
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                                              Context of the computation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     18-03-2026 -- 20:04:16
-OS:       g1085__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
-CPU model : AMD EPYC 7A53 64-Core Processor
-Total number of threads:128
-GPU model: AMD Instinct MI250X
-HIP runtime version: 6.43
-HIP drivers version: 6.43
-Nb procs used for the computation: 16
-TRUST version: 1.9.8_beta
-Total number of elements used for the calculation: 80864000
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                                           Computation start-up statistics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                54.0581        
-Number of virtual exchanges:                                               61             
-Maximum number of MPI allreduce per time step                              113            
-
-Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             8.03732        
-Average number of iteration of the linear solver per call:                 23             
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                                                 Time loop statistics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               3.87395        
-Number of time steps:                                                      9              
-Skipped time steps:                                                        1              
-Average time per time step:                                                0.430439       
-Standard deviation between time steps:                                     0.0201513      
-Time elapsed in the skipped time steps:                                    0.520931       
-
-Percent of total time spend in communication:                              4.0433         
-
-Standard counter description             | Time/step       | % loop time | Call(s)/step   
-------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.262045        | 60.9        | 1              
-Convection operator                      | 0.03891157      |  9.0        | 1              
-Diffusion operator                       | 0.01705334      |  4.0        | 1              
-Gradient operator                        | 0.05048981      | 11.7        | 2              
-Divergence operator                      | 0.01093064      |  2.5        | 2              
-Update ::mettre_a_jour                   | 0.01398206      |  3.2        | 1              
-Computation of the time step dt          | 0.01677364      |  3.9        | 2              
-Post-treatment operations                | 0.00391899      |  0.9        | 1              
-Other operations                         | 0.01633393      |  3.8        | 
-Number of virtual exchanges per time step:                                 10             
-Maximum number of MPI allreduce per time step                              14             
-
-Average number of iteration of the linear solver per call:                 16.8           
-
-
----------------------------------------------------------------------------------------------------------
-Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
----------------------------------------------------------------------------------------------------------
-
-Average of the fraction of the time spent in communications between processors:      3.3            %
-Max of the fraction of the time spent in communications between processors:          4.8            %
-Min of the fraction of the time spent in communications between processors:          2.7            %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         8.05391e-06    
-Network maximum bandwidth on all processors:                                         47.5 GB/s      
-Total network traffic:                                                               3572.84        MB/time step
-Average message size:                                                                2296.83        kB
-Min waiting time:                                                                    2.8            % of total time
-Max waiting time:                                                                    4.5            % of total time
-Avg waiting time:                                                                    3.66875        % of total time
-
------------------------------------------------------------------------------------------------------------
-                                                    GPU statistics
------------------------------------------------------------------------------------------------------------
-Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
------------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.261533        | 60.8        | 1               | 
-Kernels:                                 | 0.144021        | 33.5        | 133             | 
-Copy host to device:                     | 0.000297935     |  0.1        | 5               | 9.1 GB/s
-Copy device to host:                     | 0.000368028     |  0.1        | 4               | 14.3 GB/s
-Alloc/Free on device:                    | 1.32222e-05     |  0.0        | 6               | 
-GPU: 94% Copy H<->D: 0.15% Alloc/free: 0.0031% Comm: 4.6% CPU & I/O: 1%
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                                              Post-resolution statistics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               3.45915        
-Maximum number of MPI allreduce per time step                              6              
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Max waiting time big    => probably due to a bad partitioning
-Communications > 30%    => too many processors or network too slow
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Total time for the whole computation                                       61.9121        
-
-[Slurm] Power consumption (71 s):  1.265 kW  0.025 kWh  0.002 € (0.10€/kWh)
+[Slurm] Power consumption (55 s):  0.558 kW  0.009 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx942x16 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx942x16
index 0342f9c835..ce3dcf89ba 100644
--- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx942x16
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.adastra_gfx942x16
@@ -8,7 +8,7 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     17-03-2026 -- 19:20:34
+Date:     08-06-2026 -- 14:52:55
 OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD Instinct MI300A Accelerator
 Total number of threads:192
@@ -22,39 +22,39 @@ Total number of elements used for the calculation: 80864000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                42.2627        
-Number of virtual exchanges:                                               61             
-Maximum number of MPI allreduce per time step                              113            
+Total time of the start-up:                                                44.6328        
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              112            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             5.57906        
+Average time of the resolution of the linear problem per call:             5.07416        
 Average number of iteration of the linear solver per call:                 23             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               2.96566        
+Total time of the time loop:                                               1.65019        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.329517       
-Standard deviation between time steps:                                     0.0188315      
-Time elapsed in the skipped time steps:                                    0.458303       
+Average time per time step:                                                0.183354       
+Standard deviation between time steps:                                     0.00993221     
+Time elapsed in the skipped time steps:                                    0.272063       
 
-Percent of total time spend in communication:                              4.53575        
+Percent of total time spend in communication:                              8.04868        
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.273889        | 83.1        | 1              
-Convection operator                      | 0.01395275      |  4.2        | 1              
-Diffusion operator                       | 0.003969847     |  1.2        | 1              
-Gradient operator                        | 0.004818868     |  1.5        | 2              
-Divergence operator                      | 0.003827343     |  1.2        | 2              
-Update ::mettre_a_jour                   | 0.008657538     |  2.6        | 1              
-Computation of the time step dt          | 0.00643696      |  2.0        | 2              
-Post-treatment operations                | 0.002952047     |  0.9        | 1              
-Other operations                         | 0.01101313      |  3.3        | 
-Number of virtual exchanges per time step:                                 10             
+Linear solver resolutions Ax=B           | 0.133307        | 72.7        | 1              
+Convection operator                      | 0.01025036      |  5.6        | 1              
+Diffusion operator                       | 0.004059215     |  2.2        | 1              
+Gradient operator                        | 0.004627898     |  2.5        | 2              
+Divergence operator                      | 0.001907161     |  1.0        | 2              
+Update ::mettre_a_jour                   | 0.007985502     |  4.4        | 1              
+Computation of the time step dt          | 0.006856734     |  3.7        | 2              
+Post-treatment operations                | 0.002799652     |  1.5        | 1              
+Other operations                         | 0.01156031      |  6.3        | 
+Number of virtual exchanges per time step:                                 9              
 Maximum number of MPI allreduce per time step                              14             
 
 Average number of iteration of the linear solver per call:                 16.8           
@@ -64,32 +64,32 @@ Average number of iteration of the linear solver per call:                 16.8
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      4              %
-Max of the fraction of the time spent in communications between processors:          5.7            %
-Min of the fraction of the time spent in communications between processors:          3.8            %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         2.05594e-05    
-Network maximum bandwidth on all processors:                                         43.6 GB/s      
+Average of the fraction of the time spent in communications between processors:      7              %
+Max of the fraction of the time spent in communications between processors:          10.7           %
+Min of the fraction of the time spent in communications between processors:          6.4            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         1.91406e-05    
+Network maximum bandwidth on all processors:                                         42.1 GB/s      
 Total network traffic:                                                               3572.84        MB/time step
 Average message size:                                                                2296.83        kB
-Min waiting time:                                                                    3.7            % of total time
-Max waiting time:                                                                    5.2            % of total time
-Avg waiting time:                                                                    4.5625         % of total time
+Min waiting time:                                                                    6.9            % of total time
+Max waiting time:                                                                    9.2            % of total time
+Avg waiting time:                                                                    7.975          % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.273451        | 83.0        | 1               | 
-Kernels:                                 | 0.0357669       | 10.9        | 133             | 
-Copy host to device:                     | 0.000190326     |  0.1        | 5               | 14.3 GB/s
-Copy device to host:                     | 0.000208065     |  0.1        | 4               | 25.3 GB/s
-Alloc/Free on device:                    | 1.49636e-05     |  0.0        | 6               | 
-GPU: 94% Copy H<->D: 0.12% Alloc/free: 0.0045% Comm: 5.2% CPU & I/O: 0.8%
+Libraries:                               | 0.132898        | 72.5        | 1               | 
+Kernels:                                 | 0.0301383       | 16.4        | 133             | 
+Copy host to device:                     | 0.000228833     |  0.1        | 5               | 11.9 GB/s
+Copy device to host:                     | 0.000239867     |  0.1        | 4               | 22.0 GB/s
+Alloc/Free on device:                    | 2.84444e-07     |  0.0        | 6               | 
+GPU: 89% Copy H<->D: 0.26% Alloc/free: 0.00016% Comm: 9.4% CPU & I/O: 1.4%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               1.95444        
+Time of the post-resolution:                                               2.20561        
 Maximum number of MPI allreduce per time step                              6              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -97,6 +97,6 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       47.6411        
+Total time for the whole computation                                       48.7606        
 
-[Slurm] Power consumption (57 s):  1.815 kW  0.029 kWh  0.003 € (0.10€/kWh)
+[Slurm] Power consumption (57 s):  2.676 kW  0.042 kWh  0.004 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.jean-zay_cc90x8 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.jean-zay_cc90x8
new file mode 100644
index 0000000000..725817887b
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.jean-zay_cc90x8
@@ -0,0 +1,102 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PAR_OpenMP_Iterateur_BENCH_PETSc_10_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     10-06-2026 -- 15:37:09
+OS:       jzxh250__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.20
+Nb procs used for the computation: 8
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 80864000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                50.1571        
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              112            
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             9.18945        
+Average number of iteration of the linear solver per call:                 28             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               3.84827        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.427586       
+Standard deviation between time steps:                                     0.0255095      
+Time elapsed in the skipped time steps:                                    0.699009       
+
+Percent of total time spend in communication:                              1.14865        
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.368848        | 86.3        | 1              
+Convection operator                      | 0.0146931       |  3.4        | 1              
+Diffusion operator                       | 0.005882547     |  1.4        | 1              
+Gradient operator                        | 0.00578996      |  1.4        | 2              
+Divergence operator                      | 0.002865112     |  0.7        | 2              
+Update ::mettre_a_jour                   | 0.009087068     |  2.1        | 1              
+Computation of the time step dt          | 0.003678033     |  0.9        | 2              
+Post-treatment operations                | 0.004289857     |  1.0        | 1              
+Other operations                         | 0.01245212      |  2.9        | 
+Number of virtual exchanges per time step:                                 9              
+Maximum number of MPI allreduce per time step                              14             
+
+Average number of iteration of the linear solver per call:                 20.9           
+
+
+---------------------------------------------------------------------------------------------------------
+Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
+---------------------------------------------------------------------------------------------------------
+
+Average of the fraction of the time spent in communications between processors:      1.6            %
+Max of the fraction of the time spent in communications between processors:          2.7            %
+Min of the fraction of the time spent in communications between processors:          1.3            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         5.57243e-06    
+Network maximum bandwidth on all processors:                                         180.4 GB/s     
+Total network traffic:                                                               2535.2         MB/time step
+Average message size:                                                                4541.56        kB
+Min waiting time:                                                                    1.3            % of total time
+Max waiting time:                                                                    2.5            % of total time
+Avg waiting time:                                                                    1.9375         % of total time
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.368458        | 86.2        | 1               | 
+Kernels:                                 | 0.0464343       | 10.9        | 133             | 
+Copy host to device:                     | 0.000551314     |  0.1        | 5               | 8.8 GB/s
+Copy device to host:                     | 0.000781695     |  0.2        | 4               | 11.7 GB/s
+Alloc/Free on device:                    | 3.24e-07        |  0.0        | 6               | 
+GPU: 97% Copy H<->D: 0.31% Alloc/free: 7.6e-05% Comm: 1.4% CPU & I/O: 1.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               2.33551        
+Maximum number of MPI allreduce per time step                              6              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Max waiting time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Total time for the whole computation                                       57.0399        
+
+[Slurm] Power consumption (88 s):  0.947 kW  0.023 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.lumi_gfx90ax16 b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.lumi_gfx90ax16
index 654fbec68e..d073b666c4 100644
--- a/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.lumi_gfx90ax16
+++ b/tests/GPU/OpenMP_Iterateur/PAR_OpenMP_Iterateur_BENCH_PETSc_10.TU.lumi_gfx90ax16
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     24-02-2026 -- 00:30:07
-OS:       nid007973__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     15-05-2026 -- 21:00:20
+OS:       nid005023__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,39 +22,39 @@ Total number of elements used for the calculation: 80864000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                62.7028        
-Number of virtual exchanges:                                               61             
-Maximum number of MPI allreduce per time step                              113            
+Total time of the start-up:                                                78.8995        
+Number of virtual exchanges:                                               59             
+Maximum number of MPI allreduce per time step                              112            
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             7.42008        
+Average time of the resolution of the linear problem per call:             8.6603         
 Average number of iteration of the linear solver per call:                 23             
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               3.98958        
+Total time of the time loop:                                               3.3746         
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.443287       
-Standard deviation between time steps:                                     0.0201239      
-Time elapsed in the skipped time steps:                                    0.540091       
+Average time per time step:                                                0.374955       
+Standard deviation between time steps:                                     0.0173637      
+Time elapsed in the skipped time steps:                                    0.473033       
 
-Percent of total time spend in communication:                              4.67846        
+Percent of total time spend in communication:                              4.13723        
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.26956         | 53.6        | 1              
-Convection operator                      | 0.04405853      |  8.8        | 1              
-Diffusion operator                       | 0.01489942      |  3.0        | 1              
-Gradient operator                        | 0.04936467      |  9.8        | 2              
-Divergence operator                      | 0.01089552      |  2.2        | 2              
-Update ::mettre_a_jour                   | 0.01366762      |  2.7        | 1              
-Computation of the time step dt          | 0.01753448      |  3.5        | 2              
-Post-treatment operations                | 0.00330442      |  0.7        | 1              
-Other operations                         | 0.02000149      |  4.0        | 
-Number of virtual exchanges per time step:                                 10             
+Linear solver resolutions Ax=B           | 0.25189         | 67.2        | 1              
+Convection operator                      | 0.03422895      |  9.1        | 1              
+Diffusion operator                       | 0.01616347      |  4.3        | 1              
+Gradient operator                        | 0.01467866      |  3.9        | 2              
+Divergence operator                      | 0.01064046      |  2.8        | 2              
+Update ::mettre_a_jour                   | 0.01338688      |  3.6        | 1              
+Computation of the time step dt          | 0.01679689      |  4.5        | 2              
+Post-treatment operations                | 0.00328297      |  0.9        | 1              
+Other operations                         | 0.01388677      |  3.7        | 
+Number of virtual exchanges per time step:                                 9              
 Maximum number of MPI allreduce per time step                              14             
 
 Average number of iteration of the linear solver per call:                 16.8           
@@ -64,32 +64,32 @@ Average number of iteration of the linear solver per call:                 16.8
 Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated.
 ---------------------------------------------------------------------------------------------------------
 
-Average of the fraction of the time spent in communications between processors:      3.6            %
-Max of the fraction of the time spent in communications between processors:          5.7            %
-Min of the fraction of the time spent in communications between processors:          2.3            %
-Time of one mpsum measured by an internal bench over 0.1s (network latency):         8.50623e-06    
-Network maximum bandwidth on all processors:                                         54.1 GB/s      
+Average of the fraction of the time spent in communications between processors:      3.9            %
+Max of the fraction of the time spent in communications between processors:          6.2            %
+Min of the fraction of the time spent in communications between processors:          3.4            %
+Time of one mpsum measured by an internal bench over 0.1s (network latency):         8.64124e-06    
+Network maximum bandwidth on all processors:                                         42.6 GB/s      
 Total network traffic:                                                               3572.84        MB/time step
 Average message size:                                                                2296.83        kB
-Min waiting time:                                                                    2.5            % of total time
-Max waiting time:                                                                    5.3            % of total time
-Avg waiting time:                                                                    4.06875        % of total time
+Min waiting time:                                                                    3.4            % of total time
+Max waiting time:                                                                    5.8            % of total time
+Avg waiting time:                                                                    4.3875         % of total time
 
 -----------------------------------------------------------------------------------------------------------
                                                     GPU statistics
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.269051        | 60.7        | 1               | 
-Kernels:                                 | 0.146349        | 33.0        | 133             | 
-Copy host to device:                     | 0.000343226     |  0.1        | 5               | 7.9 GB/s
-Copy device to host:                     | 0.000442977     |  0.1        | 4               | 11.9 GB/s
-Alloc/Free on device:                    | 1.62248e-05     |  0.0        | 6               | 
-GPU: 94% Copy H<->D: 0.18% Alloc/free: 0.0037% Comm: 5.3% CPU & I/O: 0.8%
+Libraries:                               | 0.25138         | 67.0        | 1               | 
+Kernels:                                 | 0.101628        | 27.1        | 133             | 
+Copy host to device:                     | 0.000340213     |  0.1        | 5               | 8.0 GB/s
+Copy device to host:                     | 0.000432693     |  0.1        | 4               | 12.2 GB/s
+Alloc/Free on device:                    | 5.12e-07        |  0.0        | 6               | 
+GPU: 94% Copy H<->D: 0.21% Alloc/free: 0.00014% Comm: 4.7% CPU & I/O: 0.93%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               3.27229        
+Time of the post-resolution:                                               3.00196        
 Maximum number of MPI allreduce per time step                              6              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -97,6 +97,6 @@ Max waiting time big    => probably due to a bad partitioning
 Communications > 30%    => too many processors or network too slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Total time for the whole computation                                       70.5048        
+Total time for the whole computation                                       85.7491        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (114 s):  1.176 kW  0.037 kWh  0.004 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_Iterateur/check_perf.sh b/tests/GPU/OpenMP_Iterateur/check_perf.sh
index b9c4f392cd..ecd6c19405 100755
--- a/tests/GPU/OpenMP_Iterateur/check_perf.sh
+++ b/tests/GPU/OpenMP_Iterateur/check_perf.sh
@@ -15,7 +15,7 @@ check()
    then
       mv -f $TU $TU_REF && [ "$TRUST_SCM" = 1 ] && git add $TU_REF
       echo "Creating new reference $TU_REF"
-      exit
+      exit 0
    fi 
    ref=`TU.sh $TU_REF -dt`
    new=`TU.sh $TU     -dt`
@@ -68,6 +68,9 @@ else
    [ $HOST = is157091 ]     && run $HOST$GPU_ARCH 2 OpenMP_Iterateur_BENCH_PETSc
    [ "`hostname`" = petra ] && run $HOST$GPU_ARCH 2
    [ $HOST = topaze ]       && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10
+   [ $HOST = dalianvl ]        && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10
+   [ $HOST = jean-zay ]       && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_PETSc_10 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10
+   [ $HOST = dalia ]        && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10
    [ $HOST = adastra ]      && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 16 OpenMP_Iterateur_BENCH_PETSc_10
    [ $HOST = lumi ]      && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 16 OpenMP_Iterateur_BENCH_PETSc_10
    [ $HOST = irene-amd-ccrt ]     && run $HOST$GPU_ARCH 4 && run $HOST$GPU_ARCH 8 OpenMP_Iterateur_BENCH_AmgX_10
diff --git a/tests/GPU/OpenMP_Iterateur/plot_scaling.py b/tests/GPU/OpenMP_Iterateur/plot_scaling.py
new file mode 100644
index 0000000000..e156b3bb73
--- /dev/null
+++ b/tests/GPU/OpenMP_Iterateur/plot_scaling.py
@@ -0,0 +1,344 @@
+#!/usr/bin/env python3
+"""
+plot_scaling.py
+---------------
+Plot Performance [MDOF/s] vs Problem Size [MDOF] from *SCALING.* files.
+
+Usage:
+    python3 plot_scaling.py                        # auto-detect *SCALING.* in current dir
+    python3 plot_scaling.py file1 file2 ...        # explicit files
+    python3 plot_scaling.py -o output.pdf file1    # custom output name
+    python3 plot_scaling.py -all                   # also plot Solver/Conv/Diff/Grad/Div
+    python3 plot_scaling.py -normalize [-log]      # normalize each curve to [0, 1]; -log: logarithmic X axis
+"""
+
+import sys
+import os
+import glob
+import argparse
+from collections import defaultdict
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+
+# ── Matplotlib / rcParams for paper-quality output ──────────────────────────
+plt.rcParams.update({
+    "text.usetex":        False,        # set True if LaTeX is available
+    "font.family":        "serif",
+    "font.serif":         ["DejaVu Serif", "Times New Roman", "Times"],
+    "font.size":          11,
+    "axes.titlesize":     13,
+    "axes.labelsize":     12,
+    "xtick.labelsize":    10,
+    "ytick.labelsize":    10,
+    "legend.fontsize":    9,
+    "legend.framealpha":  0.85,
+    "lines.linewidth":    1.5,
+    "lines.markersize":   6,
+    "figure.dpi":         150,
+    "savefig.dpi":        300,
+    "savefig.bbox":       "tight",
+    "savefig.pad_inches": 0.05,
+    "axes.grid":          True,
+    "grid.linestyle":     "--",
+    "grid.linewidth":     0.5,
+    "grid.alpha":         0.6,
+    "axes.spines.top":    False,
+    "axes.spines.right":  False,
+})
+
+# ── Colour / marker cycle (colour-blind-friendly) ────────────────────────────
+COLORS = [
+    "#0072B2",  # blue
+    "#D55E00",  # vermilion
+    "#009E73",  # green
+    "#CC79A7",  # pink
+    "#E69F00",  # orange
+    "#56B4E9",  # sky blue
+    "#F0E442",  # yellow
+]
+MARKERS = ["o", "s", "^", "D", "v", "P", "X"]
+
+# Metrics available in -all mode, as (display name, 0-based column index) pairs
+ALL_METRICS = [
+    ("MDOF/s", 10),
+    ("Solver",  11),
+    ("Conv",    13),
+    ("Diff",    14),
+    ("Grad",    15),
+    ("Div",     16),
+]
+# Line styles to distinguish metrics when multiple configs are present
+LINESTYLES = ["-", "--", "-.", ":", (0, (3, 1, 1, 1)), (0, (5, 2))]
+
+
+# ── File parsing ─────────────────────────────────────────────────────────────
+
+def parse_scaling_file(path: str, all_metrics: bool = False):
+    """
+    Return parsed data from a SCALING file.
+
+    Default mode  → {config: ([MDOF], [MDOF/s])}
+    all_metrics   → {config: ([MDOF], {metric_name: [values]})}
+
+    Header layout (0-based column indices):
+      0=Config  1=[MTET]  2=[MDOF]  3=TimeStep[s]  4=Solver[s]  5=[its]
+      6=[ms/it]  7=Kernels[s]  8=RAM[GB]  9=DRAM[GB]  10=[MDOF/s]
+      11=Solver  12=Kernels  13=Conv  14=Diff  15=Grad  16=Div
+
+    Blank lines, '#' comments and the "Config" header row are ignored. Rows with
+    fewer than 11 columns, or whose [MDOF] (in default mode also [MDOF/s]) is not
+    a number, are reported and skipped without creating an entry for their config.
+    In all_metrics mode a missing or non-numeric metric column is stored as NaN.
+    """
+    # Columns 0..10 are mandatory in both modes; columns 11..16 are optional.
+    min_cols = 11
+    fname = os.path.basename(path)
+    raw_data: dict = defaultdict(lambda: defaultdict(list))
+
+    with open(path) as fh:
+        for lineno, line in enumerate(fh, 1):
+            parts = line.split()
+            # Nothing to parse on blank lines, comments and the header row.
+            if not parts or parts[0].startswith("#") or parts[0] == "Config":
+                continue
+            # Too short to hold the mandatory columns.
+            if len(parts) < min_cols:
+                print(f"  [skip] {fname}:{lineno} – only {len(parts)} columns")
+                continue
+            config = parts[0]
+            try:
+                mdof = float(parts[2])
+                # Default mode needs a numeric [MDOF/s]; parse it before storing anything.
+                perf = None if all_metrics else float(parts[10])
+            except ValueError as exc:
+                print(f"  [skip] {fname}:{lineno} – {exc}")
+                continue
+
+            raw_data[config]["mdof"].append(mdof)
+            if not all_metrics:
+                raw_data[config]["MDOF/s"].append(perf)
+                continue
+            # Files with only the basic columns are still accepted: absent metrics become NaN.
+            for name, col in ALL_METRICS:
+                try:
+                    raw_data[config][name].append(float(parts[col]))
+                except (ValueError, IndexError):
+                    raw_data[config][name].append(float("nan"))
+
+    # Convert to numpy arrays
+    result = {}
+    for config, arrays in raw_data.items():
+        mdof = np.array(arrays["mdof"])
+        if all_metrics:
+            metrics = {name: np.array(arrays[name]) for name, _ in ALL_METRICS}
+            result[config] = (mdof, metrics)
+        else:
+            result[config] = (mdof, np.array(arrays["MDOF/s"]))
+    return result
+
+
+# ── Plotting ─────────────────────────────────────────────────────────────────
+
+def sort_configs(configs: list) -> list:
+    """Return configs ordered by (MPI count, GPU count) parsed from labels like "4MPI+2GPU"; unparsable labels rank (0, 0)."""
+    def rank(label):
+        try:
+            n_mpi = int(label.split("MPI")[0])
+            n_gpu = int(label.split("+")[1].replace("GPU", "")) if "+" in label else 0
+            return (n_mpi, n_gpu)
+        except Exception:
+            return (0, 0)
+    return sorted(configs, key=rank)
+
+
+def _normalize(arr: np.ndarray) -> np.ndarray:
+    """Divide arr by its maximum (NaN ignored); empty, all-NaN or zero-max input is returned unchanged."""
+    if arr.size == 0 or np.all(np.isnan(arr)):
+        return arr  # np.nanmax raises on an empty array and warns on an all-NaN one
+    mx = np.nanmax(arr)
+    return arr if mx == 0 else arr / mx
+
+
+def plot_scaling(files: list, output: str, all_mode: bool = False, normalize: bool = False, log: bool = False):
+    """Read each SCALING file and save one figure per file (chosen format plus a PNG companion).
+
+    output: figure path, given a _NN per-file suffix when several files are passed; "" writes <stem>.pdf next to
+    each input. all_mode: plot every ALL_METRICS column; normalize: scale curves by their maximum; log: log X axis.
+    """
+    for file_idx, fpath in enumerate(files):
+        print(f"Processing: {fpath}")
+        data = parse_scaling_file(fpath, all_metrics=all_mode)
+        if not data:
+            print("  [warn] no valid data found, skipping.")
+            continue
+
+        fig, ax = plt.subplots(figsize=(6.5, 4.5))
+        configs = sort_configs(list(data.keys()))
+
+        if all_mode:
+            # ── -all: one curve per (config, metric) ────────────────────────
+            # Colors cycle over metrics, line styles over configs
+            metric_names = [name for name, _ in ALL_METRICS]
+            metric_color = {name: COLORS[i % len(COLORS)] for i, name in enumerate(metric_names)}
+            config_ls    = {cfg: LINESTYLES[i % len(LINESTYLES)] for i, cfg in enumerate(configs)}
+
+            for config in configs:
+                mdof, metrics = data[config]
+                order = np.argsort(mdof)
+                mdof = mdof[order]
+                ls   = config_ls[config]
+                for name in metric_names:
+                    vals = metrics[name][order]
+                    if normalize:
+                        vals = _normalize(vals)
+                    label = f"{config} – {name}"
+                    ax.plot(
+                        mdof, vals,
+                        color=metric_color[name],
+                        linestyle=ls,
+                        marker=MARKERS[metric_names.index(name) % len(MARKERS)],
+                        label=label,
+                        zorder=3,
+                    )
+
+            # Legend: two sections – one for metrics (color), one for configs (linestyle)
+            from matplotlib.lines import Line2D
+            legend_handles = []
+            for name in metric_names:
+                legend_handles.append(Line2D([0], [0], color=metric_color[name],
+                                             linewidth=2, label=name))
+            if len(configs) > 1:
+                legend_handles.append(Line2D([0], [0], color="none", label=""))
+                for cfg in configs:
+                    legend_handles.append(Line2D([0], [0], color="grey",
+                                                 linestyle=config_ls[cfg], label=cfg))
+            ax.legend(handles=legend_handles, fontsize=8, loc="best", handlelength=2.5)
+
+        else:
+            # ── default: one curve per config, [MDOF/s] only ────────────────
+            for idx, config in enumerate(configs):
+                mdof, perf = data[config]
+                order = np.argsort(mdof)
+                mdof, perf = mdof[order], perf[order]
+                if normalize:
+                    perf = _normalize(perf)
+                ax.plot(
+                    mdof, perf,
+                    color=COLORS[idx % len(COLORS)],
+                    marker=MARKERS[idx % len(MARKERS)],
+                    label=config,
+                    zorder=3,
+                )
+
+            ncol = max(1, len(configs) // 6 + 1)
+            ax.legend(title="Configuration", ncol=ncol, loc="best", handlelength=2.0)
+
+        # ── axes labels & title ──────────────────────────────────────────────
+        ax.set_xlabel("Problem size [MDOF]")
+        if normalize:
+            ax.set_ylabel("Normalized performance [0–1]")
+            ax.set_ylim(0, 1.05)
+        else:
+            ax.set_ylabel("Performance [MDOF/s]")
+            ax.set_ylim(bottom=0)
+
+        base  = os.path.splitext(os.path.basename(fpath))[0]
+        parts = base.split("_SCALING")
+        if len(parts) == 2:
+            case   = parts[0].replace("_", " ")
+            suffix = parts[1].lstrip(".")
+            title  = f"{case} – Performance Scaling"
+            if suffix:
+                title += f"\n({suffix})"
+        else:
+            title = base.replace("_", " ")
+        ax.set_title(title)
+
+        if log:
+            ax.set_xscale("log")
+            ax.xaxis.set_major_formatter(ticker.LogFormatterSciNotation(labelOnlyBase=False))
+        else:
+            ax.set_xlim(left=0)
+            ax.xaxis.set_minor_locator(ticker.AutoMinorLocator())
+        ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
+        ax.tick_params(which="minor", length=3)
+
+        # ── save ─────────────────────────────────────────────────────────────
+        if output:
+            stem    = os.path.splitext(output)[0]
+            ext     = os.path.splitext(output)[1] or ".pdf"
+            idx_str = f"_{file_idx:02d}" if len(files) > 1 else ""
+            out_path = stem + idx_str + ext
+        else:
+            dir_     = os.path.dirname(os.path.abspath(fpath))
+            stem     = os.path.splitext(os.path.basename(fpath))[0]
+            out_path = os.path.join(dir_, stem + ".pdf")
+
+        fig.savefig(out_path)
+        print(f"  -> saved: {out_path}")
+        # Also write a PNG companion, unless the requested output already is that PNG.
+        png_path = os.path.splitext(out_path)[0] + ".png"
+        if png_path != out_path:
+            fig.savefig(png_path)
+            print(f"  -> saved: {png_path}")
+        plt.close(fig)
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+def main():
+    """Command-line entry point.
+
+    Parse the options, auto-detect *SCALING.* files when none are given, then plot them.
+    """
+    cli = argparse.ArgumentParser(
+        description="Plot Performance [MDOF/s] vs Problem Size [MDOF] from SCALING files."
+    )
+    cli.add_argument(
+        "files", nargs="*",
+        help="SCALING file(s). If omitted, auto-detects *SCALING.* in the current directory.",
+    )
+    cli.add_argument(
+        "-o", "--output", default="",
+        help="Output file path (e.g. plot.pdf). Extension determines format.",
+    )
+    cli.add_argument(
+        "-all", action="store_true", dest="all_mode",
+        help="Plot all kernel metrics (MDOF/s, Solver, Conv, Diff, Grad, Div) in the same figure.",
+    )
+    cli.add_argument(
+        "-normalize", action="store_true",
+        help="Normalize each curve by its maximum so values range from 0 to 1.",
+    )
+    cli.add_argument(
+        "-log", action="store_true",
+        help="Use a logarithmic scale on the X axis.",
+    )
+    opts = cli.parse_args()
+
+    files = opts.files
+    if not files:
+        # Editor swap/backup files, generated figures and bytecode are never inputs.
+        ignored = (".swp", ".pdf", ".png", ".eps", ".svg", ".jpg", ".jpeg", ".pyc", "~")
+        candidates = sorted(glob.glob("*SCALING.*"))
+        files = []
+        for name in candidates:
+            ext = os.path.splitext(name)[1].lower()
+            if ext in ignored or name.endswith("~") or not os.path.isfile(name):
+                continue
+            files.append(name)
+        if not files:
+            print("No SCALING files found in the current directory.")
+            print("Usage: python3 plot_scaling.py [file1 file2 ...] [-o output.pdf]")
+            sys.exit(1)
+        print(f"Auto-detected files: {files}")
+
+    plot_scaling(files, opts.output, all_mode=opts.all_mode, normalize=opts.normalize, log=opts.log)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/GPU/OpenMP_Iterateur/scaling.sh b/tests/GPU/OpenMP_Iterateur/scaling.sh
index 800239ec47..c33d03160a 100755
--- a/tests/GPU/OpenMP_Iterateur/scaling.sh
+++ b/tests/GPU/OpenMP_Iterateur/scaling.sh
@@ -1,8 +1,8 @@
-#!/bin/bash
+#!/bin/bash 
 # Scaling a mesh on several GPU
 [ "$TRUST_ROOT" = "" ] && echo "TRUST_ROOT empty." && exit
-
-# HOST:   
+scale=1.15 && [ "$1" != "" ] && scale=$1
+# HOST:
 HOST=${HOST%.intra.cea.fr} && [ "$HOST" = portable ] && HOST=is246827
 # ARCH:
 GPU_ARCH=""
@@ -28,12 +28,12 @@ do
       gpus="1 2"
    else
       gpus="1"
-   fi   
+   fi
 
    # Target problem sizes in MDOF (millions of degrees of freedom).
    # MDOF formula: 1.2*40*(Nx-1)*(Ny-1)*(Nz-1)/1e6
    # Nx, Ny, Nz are scaled uniformly from the base mesh in the .data file.
-   
+
    # Read base Nombre_de_Noeuds from reference data file.
    # Prefer /* Nombre_de_Noeuds X Y Z */ commented template (scaling hint);
    # fall back to the last active Nombre_de_Noeuds line.
@@ -44,40 +44,26 @@ do
    else
       ref_data=$ROOT/`basename $ROOT`.data
    fi
-   read Nx0 Ny0 Nz0 <<< $(awk '
-      /\/\* Nombre_de_Noeuds [0-9]/          { cx=$3; cy=$4; cz=$5 }
-      /^[[:space:]]*Nombre_de_Noeuds [0-9]/  { ax=$2; ay=$3; az=$4 }
-      END { if (cx!="") print cx,cy,cz; else print ax,ay,az }
-   ' $ref_data)
-   [ -z "$Nx0" ] && echo "Error: Nombre_de_Noeuds not found in $ref_data" && exit 1
-   echo "# Base mesh from $ref_data: Nombre_de_Noeuds $Nx0 $Ny0 $Nz0"
-
+   Lx=`awk '/Longueurs/ {print $2}' $ref_data`
+   Ly=`awk '/Longueurs/ {print $3}' $ref_data`
+   Lz=`awk '/Longueurs/ {print $4}' $ref_data`
+   alpha=1
    for gpu in $gpus
    do
       mpis=$TRUST_NB_PHYSICAL_CORES && [ $version = gpu ] && mpis=$gpu
-      mdof_target=0.1
-      inc_mdof=0.1
       for mpi in $mpis
       do
          while [ 1 ]
          do
-            # Compute new Nx Ny Nz matching the target MDOF via uniform scaling:
-            #   alpha = cbrt( target_MDOF*1e6 / (1.2*40*(Nx0-1)*(Ny0-1)*(Nz0-1)) )
-            #   Ni = 1 + round( alpha * (Ni0-1) )
-            read Nx Ny Nz <<< $(awk -v Nx0=$Nx0 -v Ny0=$Ny0 -v Nz0=$Nz0 -v target=$mdof_target \
-               'BEGIN {
-                  base  = 1.2 * 40 * (Nx0-1) * (Ny0-1) * (Nz0-1)
-                  alpha = (target * 1e6 / base) ^ (1.0/3.0)
-                  Nx = int(0.5 + 1 + alpha * (Nx0-1)); if (Nx < 2) Nx = 2
-                  Ny = int(0.5 + 1 + alpha * (Ny0-1)); if (Ny < 2) Ny = 2
-                  Nz = int(0.5 + 1 + alpha * (Nz0-1)); if (Nz < 2) Nz = 2
-                  print Nx, Ny, Nz
-               }')
+            Nx=`echo $Lx $alpha | awk '{print int($1*$2)}'`
+            Ny=`echo $Ly $alpha | awk '{print int($1*$2)}'`
+            Nz=`echo $Lz $alpha | awk '{print int($1*$2)}'`
+            alpha=`echo $alpha*$scale | bc -l`
             jdd=$mpi"_"$gpu"_"${Nx}x${Ny}x${Nz}
             mkdir -p $ROOT/scaling/$jdd && cd $ROOT/scaling/$jdd
             # Run ?
             run=1 && [ -f $jdd.out_err ] && run=0
-            [ $run = 1 ] && echo "$jdd (target ${mdof_target} MDOF) ..."
+            [ $run = 1 ] && echo "$jdd ..."
             # Creation data
             if [ -f $ROOT/OpenMP_Iterateur.data ]
             then
@@ -100,15 +86,16 @@ do
             # Decoupage
             [ $run = 1 ] && [ $mpi != 1 ] && (make_PAR.data $jdd $mpi 1>/dev/null 2>&1;cp PAR_$jdd.data $jdd.data)
             # Calcul
-            [ $run = 1 ] && (trust $jdd $mpi -ksp_view -journal=0 1>$jdd.out_err 2>&1 || (rm -f *.TU;echo "Error:See "`pwd`/$jdd.out_err))
-            [ "`grep 'Arret des process' $jdd.out_err`" = "" ] && break
+            #[ $run = 1 ] && (trust $jdd $mpi -ksp_view -journal=0 1>$jdd.out_err 2>&1 || (rm -f *.TU;echo "Error:See "`pwd`/$jdd.out_err))
+            [ $run = 1 ] && (trust $jdd $mpi -journal=0 1>$jdd.out_err 2>&1 || (rm -f *.TU;echo "Error:See "`pwd`/$jdd.out_err))
+            [ "`grep 'Arret des process' $jdd.out_err`" = "" ] && alpha=`echo $alpha*0.5 | bc -l` && break
             # Analyse
-	    i=0 && [ "$HOST" = adastra ] && i=1
-	    hram=`awk -v i=$i '/RAM taken/ {if ($(13+i)>RAM) RAM=$(13+i)} END {print 0.1*int(0.01*RAM)}' $jdd.out_err`
-	    dram=`awk -v i=$i '/RAM allocated on a GPU/ {if ($(1+i)>RAM) RAM=$(1+i)} END {print RAM}' $jdd.out_err`
-            row=`awk '/Order of the PETSc matrix/ {print $7;exit}' $jdd.out_err`
-            faces=`awk '/Total number of faces/ {print $NF;exit}' $jdd.out_err`
-            elems=`awk '/Total number of elements/ {printf($NF);exit}' $jdd.out_err`
+            i=0 && [ "$HOST" = adastra ] && i=1
+            hram=`awk -v i=$i '/RAM taken/ {if ($(13+i)>RAM) RAM=$(13+i)} END {print 0.1*int(0.01*RAM)}' $jdd.out_err`
+            dram=`awk -v i=$i '/RAM allocated on a GPU/ {if ($(1+i)>RAM) RAM=$(1+i)} END {print RAM}' $jdd.out_err`
+            row=`awk -v i=$i '/Order of the PETSc matrix/ {print $(7+i);exit}' $jdd.out_err`
+            faces=`awk -v i=$i '/Total number of faces/ {print $NF;exit}' $jdd.out_err`
+            elems=`awk -v i=$i '/Total number of elements/ {printf($NF);exit}' $jdd.out_err`
             # No better to use dof=row
             dof=$row
             #dof=`echo 1*$faces | bc -l` # En VDF
@@ -117,21 +104,21 @@ do
             its=`TU.sh $jdd.TU -its`            
             awk -v mpi=$mpi -v gpu=$gpu -v elems=$elems -v row=$row -v dof=$dof -v hram=$hram -v dram=$dram -v dt=$dt -v its=$its '\
 	    BEGIN {config=mpi"MPI"(gpu==0?"":"+"gpu"GPU");mdof=dof/1e6;mtet=elems/1e6} \
-            /Linear solver/       {ts=$6;b=dt-ts;ls=mdof/ts} \
+            /Linear solver/       {ts=$6;b=dt-ts;ls=mdof/ts*its} \
             /Convection operator/ { conv=mdof/$4*$8 } \
             /Diffusion operator/  { diff=mdof/$4*$8 } \
             /Gradient operator/   { grad=mdof/$4*$8 } \
             /Divergence operator/ { dive=mdof/$4*$8 } \
-            /Kernels:/            { ks=mdof/$3 } \
-            END {printf("%s %7.3f %7.3f %11.3f %9.3f %5d %7.1f %10.3f %7.1f %8.1f %8.1f %6.1f %6.1f %4d %4d %4d %4d\n", \
+            /Kernels:/            { ks=int(mdof/$3) } \
+            END {printf("%s %7.3f %7.3f %11.3f %9.3f %5d %7.1f %10.3f %7.1f %8.1f %8.1f %4d %7d %4d %4d %4d %4d\n", \
 	             config, mtet, mdof, dt, ts, its, 1000*ts/its,    b,   hram, dram,  mdof/dt,  ls,  ks, conv, diff, grad, dive)}' $jdd.TU | tee -a $log
 	    # Clean
-	    rm -f *.sauv *.xyz *.*lata*	*.cgns* *.face *.son *.lml    
-            mdof_target=`echo $mdof_target+$inc_mdof | bc -l`
+            rm -f *.sauv *.xyz *.*lata*	*.cgns* *.face *.son *.lml
          done
       done
    done    
 done
 echo "$log created."
-python3 ./plot_scaling.py
-display JEL_bous_SCALING.png
+cd $ROOT
+python3 ../OpenMP_Iterateur/plot_scaling.py
+echo "display JEL_bous_SCALING.png"
diff --git a/tests/GPU/OpenMP_Iterateur/weak_scaling.sh b/tests/GPU/OpenMP_Iterateur/weak_scaling.sh
index 9169fa5b59..36b72d85fd 100755
--- a/tests/GPU/OpenMP_Iterateur/weak_scaling.sh
+++ b/tests/GPU/OpenMP_Iterateur/weak_scaling.sh
@@ -80,11 +80,13 @@ do
       load_imbalance=`awk '/Load imbalance/ {print $NF}' $jdd.out_err | tail -1`
       dof=`awk '/Total number of elements/ {print $NF}' $jdd.out_err | tail -1`
       dram=`awk '/allocated on a GPU/ {print $1}' $jdd.out_err | tail -1`
-      its=`awk '/Iterations/ && /solveur/ {print $NF}' $jdd.TU`
+      its=`TU.sh $jdd.TU -its` 
+      dt=`TU.sh $jdd.TU -dt`
+      s=`TU.sh $jdd.TU -solver`
       gpu="\t" && [ $bench = gpu ] && gpu="+"$mpi"GPU"
       direct="Off" && [ "`grep 'Enabling GPU' $jdd.out_err`" != "" ] && direct="On"
       kj=`grep -l $jdd myjob.* 2>/dev/null | tail -1 | awk -F. '{print $2}' | xargs -I {} sacct --format=JobID,ElapsedRaw,ConsumedEnergyRaw,NodeList --jobs={} 2>/dev/null | awk '/\.batch/ {print $3}'`
-      awk -v host=$HOST -v mpi=$mpi"MPI" -v gpu=$gpu -v dof=$dof -v lib=$load_imbalance -v its=$its -v direct=$direct -v kj=$kj -v dram=$dram '/Secondes/ && /pas de temps/ {dt=$NF} /Dont solveurs/ {s=$4;b=dt-s} END {print host" \t"dof" \t"mpi""gpu"\t"dt" \t"s" \t"b" \t"int(dof/dt*0.001*0.001)" \t"int(its)" \t"lib" \t"kj" \t\t"direct" \t\t"dram" \t\t"1000*s/its}' $jdd.TU
+      awk -v host=$HOST -v mpi=$mpi"MPI" -v gpu=$gpu -v dof=$dof -v lib=$load_imbalance -v its=$its -v direct=$direct -v kj=$kj -v dram=$dram -v dt=$dt -v s=$s 'END {print host" \t"dof" \t"mpi""gpu"\t"dt" \t"s" \t"dt-s" \t"int(dof/dt*0.001*0.001)" \t"int(its)" \t"lib" \t"kj" \t\t"direct" \t\t"dram" \t\t"1000*s/its}' $jdd.TU
       rm -f *.xyz *.sauv *.Zones # Clean
       cd - 1>/dev/null 2>&1
    done    
diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC.data b/tests/GPU/OpenMP_QC/OpenMP_QC.data
index 5bf205259d..336b8fd9a8 100644
--- a/tests/GPU/OpenMP_QC/OpenMP_QC.data
+++ b/tests/GPU/OpenMP_QC/OpenMP_QC.data
@@ -25,7 +25,7 @@ Scatter DOM.Zones dom
 END SCATTER #
 
 VEFPreP1B dis
-lire dis { P0 P1 changement_de_base_P1bulle 1 CL_pression_sommet_faible 0 modif_div_face_dirichlet 0 }
+Lire dis { reorder { algo Hilbert } }
 
 Runge_Kutta_rationnel_ordre_2 sch
 lire sch
diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.adastra_gfx90a b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.adastra_gfx90a
index cdd75e4ae9..ca96c4e05a 100644
--- a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.adastra_gfx90a
+++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 18:15:49
-OS:       g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 09:34:29
+OS:       g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,40 +22,40 @@ Total number of elements used for the calculation: 393216
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                12.5807        
+Total time of the start-up:                                                13.6576        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.353507       
+Average time of the resolution of the linear problem per call:             0.43557        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.282159       
+Total time of the time loop:                                               0.274393       
 Number of time steps:                                                      2              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.14108        
-Standard deviation between time steps:                                     0.00447892     
-Time elapsed in the skipped time steps:                                    0.299704       
+Average time per time step:                                                0.137196       
+Standard deviation between time steps:                                     0.00374847     
+Time elapsed in the skipped time steps:                                    0.303199       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0413907       | 14.2        | 2              
-Matrix assembly for implicit scheme      | 0.008847495     |  3.0        | 4              
-Convection operator                      | 0.009065098     |  3.1        | 6              
-Diffusion operator                       | 0.0179382       |  6.2        | 18             
-Gradient operator                        | 0.01716352      |  5.9        | 5              
-Divergence operator                      | 0.004838472     |  1.7        | 6              
-Source terms                             | 0.0005823695    |  0.2        | 4              
-Update ::mettre_a_jour                   | 0.01238165      |  4.3        | 4              
-Solver for implicit diffusion            | 0.008807739     |  3.0        | 4              
-Computation of the time step dt          | 0.003402077     |  1.2        | 6              
-Post-treatment operations                | 0.009174429     |  3.2        | 1              
-Other operations                         | 0.007487872     |  2.6        | 
+Linear solver resolutions Ax=B           | 0.0474501       | 34.6        | 2              
+Matrix assembly for implicit scheme      | 0.008291998     |  6.0        | 4              
+Convection operator                      | 0.008458615     |  6.2        | 6              
+Diffusion operator                       | 0.01870515      | 13.6        | 18             
+Gradient operator                        | 0.01000784      |  7.3        | 5              
+Divergence operator                      | 0.004593083     |  3.3        | 6              
+Source terms                             | 0.0005224655    |  0.4        | 4              
+Update ::mettre_a_jour                   | 0.009446594     |  6.9        | 4              
+Solver for implicit diffusion            | 0.008450158     |  6.2        | 4              
+Computation of the time step dt          | 0.003498005     |  2.5        | 6              
+Post-treatment operations                | 0.008986579     |  6.6        | 1              
+Other operations                         | 0.008785769     |  6.4        | 
 
-Average number of iteration of the linear solver per call:                 48             
+Average number of iteration of the linear solver per call:                 56.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 48
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0411813       | 29.2        | 2               | 
-Kernels:                                 | 0.0890872       | 63.1        | 963             | 
-Copy host to device:                     | 0.000447347     |  0.3        | 21              | 2.2 GB/s
-Copy device to host:                     | 0.000568236     |  0.4        | 9               | 12.0 GB/s
-Alloc/Free on device:                    | 6.8284e-05      |  0.0        | 9               | 
-GPU: 92% Copy H<->D: 0.72% Alloc/free: 0.048% Comm: 0% CPU & I/O: 6.9%
+Libraries:                               | 0.0472339       | 34.4        | 2               | 
+Kernels:                                 | 0.0789504       | 57.5        | 963             | 
+Copy host to device:                     | 0.000471898     |  0.3        | 21              | 2.1 GB/s
+Copy device to host:                     | 0.000577923     |  0.4        | 9               | 11.8 GB/s
+Alloc/Free on device:                    | 5.6098e-05      |  0.0        | 9               | 
+GPU: 92% Copy H<->D: 0.77% Alloc/free: 0.041% Comm: 0% CPU & I/O: 7.2%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.00244098     
+Time of the post-resolution:                                               0.00252862     
 
-Total time for the whole computation                                       13.165         
+Total time for the whole computation                                       14.2377        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (22 s):  0.360 kW  0.002 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.dalianvl_cc100 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..fd74cc1aad
--- /dev/null
+++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_QC_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:17:46
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 393216
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                7.27273        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.645492       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.148116       
+Number of time steps:                                                      2              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0740581      
+Standard deviation between time steps:                                     0.00232288     
+Time elapsed in the skipped time steps:                                    0.181552       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0413845       | 55.9        | 2              
+Matrix assembly for implicit scheme      | 0.001681213     |  2.3        | 4              
+Convection operator                      | 0.00283675      |  3.8        | 6              
+Diffusion operator                       | 0.004026267     |  5.4        | 18             
+Gradient operator                        | 0.001471085     |  2.0        | 5              
+Divergence operator                      | 0.001090974     |  1.5        | 6              
+Source terms                             | 0.0003684635    |  0.5        | 4              
+Update ::mettre_a_jour                   | 0.004602123     |  6.2        | 4              
+Solver for implicit diffusion            | 0.004396186     |  5.9        | 4              
+Computation of the time step dt          | 0.000828607     |  1.1        | 6              
+Post-treatment operations                | 0.005638793     |  7.6        | 1              
+Other operations                         | 0.00573316      |  7.7        | 
+
+Average number of iteration of the linear solver per call:                 54             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0411895       | 55.6        | 2               | 
+Kernels:                                 | 0.0225578       | 30.5        | 963             | 
+Copy host to device:                     | 0.000361967     |  0.5        | 21              | 2.7 GB/s
+Copy device to host:                     | 0.000221856     |  0.3        | 9               | 30.8 GB/s
+Alloc/Free on device:                    | 0.000213488     |  0.3        | 9               | 
+GPU: 86% Copy H<->D: 0.79% Alloc/free: 0.29% Comm: 0% CPU & I/O: 13%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.00191222     
+
+Total time for the whole computation                                       7.60432        
+
diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.eureka_cc89 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..fbbaf34459
--- /dev/null
+++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.eureka_cc89
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_QC_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:35:02
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 393216
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                9.42622        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.357419       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.175936       
+Number of time steps:                                                      2              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.087968       
+Standard deviation between time steps:                                     0.00155373     
+Time elapsed in the skipped time steps:                                    0.315067       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0534873       | 60.8        | 2              
+Matrix assembly for implicit scheme      | 0.002269501     |  2.6        | 4              
+Convection operator                      | 0.004729102     |  5.4        | 6              
+Diffusion operator                       | 0.005568396     |  6.3        | 18             
+Gradient operator                        | 0.002533424     |  2.9        | 5              
+Divergence operator                      | 0.001298084     |  1.5        | 6              
+Source terms                             | 0.000266546     |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.003159048     |  3.6        | 4              
+Solver for implicit diffusion            | 0.004780001     |  5.4        | 4              
+Computation of the time step dt          | 0.001324467     |  1.5        | 6              
+Post-treatment operations                | 0.003367655     |  3.8        | 1              
+Other operations                         | 0.005184457     |  5.9        | 
+
+Average number of iteration of the linear solver per call:                 54             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0533364       | 60.6        | 2               | 
+Kernels:                                 | 0.02909         | 33.1        | 963             | 
+Copy host to device:                     | 0.000282456     |  0.3        | 21              | 3.5 GB/s
+Copy device to host:                     | 0.00072923      |  0.8        | 9               | 9.4 GB/s
+Alloc/Free on device:                    | 0.000103919     |  0.1        | 9               | 
+GPU: 94% Copy H<->D: 1.2% Alloc/free: 0.12% Comm: 0% CPU & I/O: 5%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.000208091    
+
+Total time for the whole computation                                       9.91744        
+
diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is157091_cc86 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is157091_cc86
index 81040101ab..f92f5361a6 100644
--- a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is157091_cc86
+++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is157091_cc86
@@ -8,7 +8,7 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     11-03-2026 -- 18:36:08
+Date:     22-04-2026 -- 07:55:46
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 393216
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                6.16451        
+Total time of the start-up:                                                6.07675        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.279578       
+Average time of the resolution of the linear problem per call:             0.338521       
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.289052       
+Total time of the time loop:                                               0.284861       
 Number of time steps:                                                      2              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.144526       
-Standard deviation between time steps:                                     0.000854199    
-Time elapsed in the skipped time steps:                                    0.316179       
+Average time per time step:                                                0.14243        
+Standard deviation between time steps:                                     0.00177781     
+Time elapsed in the skipped time steps:                                    0.320267       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0781792       | 54.1        | 2              
-Matrix assembly for implicit scheme      | 0.005994391     |  4.1        | 4              
-Convection operator                      | 0.009186321     |  6.4        | 6              
-Diffusion operator                       | 0.009221059     |  6.4        | 18             
-Gradient operator                        | 0.004951293     |  3.4        | 5              
-Divergence operator                      | 0.004072616     |  2.8        | 6              
-Source terms                             | 0.000345246     |  0.2        | 4              
-Update ::mettre_a_jour                   | 0.005491032     |  3.8        | 4              
-Solver for implicit diffusion            | 0.009880821     |  6.8        | 4              
-Computation of the time step dt          | 0.003923569     |  2.7        | 6              
-Post-treatment operations                | 0.0045862       |  3.2        | 1              
-Other operations                         | 0.008694332     |  6.0        | 
+Linear solver resolutions Ax=B           | 0.0777574       | 54.6        | 2              
+Matrix assembly for implicit scheme      | 0.00667315      |  4.7        | 4              
+Convection operator                      | 0.008953442     |  6.3        | 6              
+Diffusion operator                       | 0.009070647     |  6.4        | 18             
+Gradient operator                        | 0.003474078     |  2.4        | 5              
+Divergence operator                      | 0.004063237     |  2.9        | 6              
+Source terms                             | 0.000364597     |  0.3        | 4              
+Update ::mettre_a_jour                   | 0.005158036     |  3.6        | 4              
+Solver for implicit diffusion            | 0.009889927     |  6.9        | 4              
+Computation of the time step dt          | 0.003849252     |  2.7        | 6              
+Post-treatment operations                | 0.004260108     |  3.0        | 1              
+Other operations                         | 0.00891642      |  6.3        | 
 
 Average number of iteration of the linear solver per call:                 54             
 
@@ -63,16 +63,16 @@ Average number of iteration of the linear solver per call:                 54
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.07797         | 53.9        | 2               | 
-Kernels:                                 | 0.0604061       | 41.8        | 963             | 
-Copy host to device:                     | 0.00028165      |  0.2        | 21              | 3.5 GB/s
-Copy device to host:                     | 0.000714904     |  0.5        | 9               | 9.6 GB/s
-Alloc/Free on device:                    | 0.000213664     |  0.1        | 9               | 
-GPU: 96% Copy H<->D: 0.69% Alloc/free: 0.15% Comm: 0% CPU & I/O: 3.4%
+Libraries:                               | 0.0775452       | 54.4        | 2               | 
+Kernels:                                 | 0.0590798       | 41.5        | 963             | 
+Copy host to device:                     | 0.000279161     |  0.2        | 21              | 3.5 GB/s
+Copy device to host:                     | 0.000780016     |  0.5        | 9               | 8.8 GB/s
+Alloc/Free on device:                    | 9.50135e-05     |  0.1        | 9               | 
+GPU: 96% Copy H<->D: 0.74% Alloc/free: 0.067% Comm: 0% CPU & I/O: 3.3%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.00025942     
+Time of the post-resolution:                                               0.000268055    
 
-Total time for the whole computation                                       6.77001        
+Total time for the whole computation                                       6.68216        
 
diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is159479_cc120 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..91a4f8944a
--- /dev/null
+++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is159479_cc120
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_QC_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:36:40
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 393216
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                5.48508        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.218228       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.138249       
+Number of time steps:                                                      2              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0691244      
+Standard deviation between time steps:                                     0.00128628     
+Time elapsed in the skipped time steps:                                    0.194232       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0460209       | 66.6        | 2              
+Matrix assembly for implicit scheme      | 0.001367521     |  2.0        | 4              
+Convection operator                      | 0.003197111     |  4.6        | 6              
+Diffusion operator                       | 0.003536        |  5.1        | 18             
+Gradient operator                        | 0.001323068     |  1.9        | 5              
+Divergence operator                      | 0.000847333     |  1.2        | 6              
+Source terms                             | 0.000163009     |  0.2        | 4              
+Update ::mettre_a_jour                   | 0.002235803     |  3.2        | 4              
+Solver for implicit diffusion            | 0.003205218     |  4.6        | 4              
+Computation of the time step dt          | 0.000807989     |  1.2        | 6              
+Post-treatment operations                | 0.002658792     |  3.8        | 1              
+Other operations                         | 0.003761627     |  5.4        | 
+
+Average number of iteration of the linear solver per call:                 54             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.045932        | 66.4        | 2               | 
+Kernels:                                 | 0.0191397       | 27.7        | 963             | 
+Copy host to device:                     | 0.000208833     |  0.3        | 21              | 4.7 GB/s
+Copy device to host:                     | 0.000781099     |  1.1        | 9               | 8.8 GB/s
+Alloc/Free on device:                    | 4.53425e-05     |  0.1        | 9               | 
+GPU: 94% Copy H<->D: 1.4% Alloc/free: 0.066% Comm: 0% CPU & I/O: 4.4%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0001672      
+
+Total time for the whole computation                                       5.81773        
+
diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is247793_gfx1100 b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is247793_gfx1100
index 698e29466a..8a943b5eb3 100644
--- a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is247793_gfx1100
+++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.is247793_gfx1100
@@ -1,53 +1,78 @@
-Statistiques d'initialisation du calcul
-
-Temps total                       7.87363
-
-Statistiques de resolution du probleme
-
-Temps total                       3.18878
-
-
-Timesteps                         3
-Secondes / pas de temps           1.06292
-Dont solveurs Ax=B                0.786720 74% (2 appels/pas de temps)
-Dont solveur diffusion_implicite  0.012582  1% (4 appels/pas de temps)
-Dont assemblage matrice_implicite 0.039556  3% (4 appels/pas de temps)
-Dont mettre_a_jour                0.007267  0% (4 appels/pas de temps)
-Dont operateurs convection        0.042721  4% (6 appels/pas de temps)
-Dont operateurs diffusion         0.025574  2% (18 appels/pas de temps)
-Dont operateurs gradient          0.008015  0% (5 appels/pas de temps)
-Dont operateurs divergence        0.003617  0% (5.66667 appels/pas de temps)
-Dont operateurs source            0.002720  0% (4 appels/pas de temps)
-Dont operations postraitement     0.119419 11% (1 appel/pas de temps)
-Dont calcul dt                    0.003073  0% (6 appels/pas de temps)
-Dont calcul divers                0.011655  1% (0 appels/pas de temps)
-Nb solveur / pas de temps         2
-Secondes / solveur                0.39336
-Iterations / solveur              369
-GPU statistics per time step (experimental):
-Libraries : 0.786482 s 74.0%  2.0 calls
-Kernels   : 0.094407 s  8.9% 1409891.0 calls
-Copy H2D  : 0.026391 s  2.5% 71.0 calls  3.8 GB/s
-Copy D2H  : 0.003439 s  0.3% 76.0 calls  9.0 GB/s
-Alloc/Free: 0.004708 s  0.4% 58.0 calls
-GPU: 82.8% Copy H<->D: 2.8% Alloc/Free: 0.4% Comm: 0% CPU & Others: 13.8%
-I/O:
-
-Timesteps = number of time steps
-Nb solveur = number of linear system resolutions
-Nb assemblage implicite = number of matrix assemblies for the implicit scheme
-Iterations = average number of iterations of the solver
-Communications = fraction of the time spent
-                 in communications between processors (excluding io files)
-Network latency = time of one mpsum measured by an internal bench over 0.1s
-Network bandwidth = maximum on all processors
-                    of the average bandwidth of send_recv operations
-Waiting time = estimation of the waiting time of the different processors
-
-Max_waiting_time big    => probably due to a bad partitioning
-Communications > 30%    => too many processors or network too slow
-
-Statistiques de post resolution
-
-Temps total                       0.087521
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the OpenMP_QC_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 19:05:11
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 393216
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.45541        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.599301       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.281635       
+Number of time steps:                                                      2              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.140818       
+Standard deviation between time steps:                                     0.00160022     
+Time elapsed in the skipped time steps:                                    0.339711       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0737454       | 52.4        | 2              
+Matrix assembly for implicit scheme      | 0.00484449      |  3.4        | 4              
+Convection operator                      | 0.00740623      |  5.3        | 6              
+Diffusion operator                       | 0.0112249       |  8.0        | 18             
+Gradient operator                        | 0.004945052     |  3.5        | 5              
+Divergence operator                      | 0.003099721     |  2.2        | 6              
+Source terms                             | 0.000512645     |  0.4        | 4              
+Update ::mettre_a_jour                   | 0.006273922     |  4.5        | 4              
+Solver for implicit diffusion            | 0.01006549      |  7.1        | 4              
+Computation of the time step dt          | 0.003010985     |  2.1        | 6              
+Post-treatment operations                | 0.005375151     |  3.8        | 1              
+Other operations                         | 0.0103136       |  7.3        | 
+
+Average number of iteration of the linear solver per call:                 56.2           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0735468       | 52.2        | 2               | 
+Kernels:                                 | 0.0612288       | 43.5        | 963             | 
+Copy host to device:                     | 0.000498129     |  0.4        | 21              | 2.0 GB/s
+Copy device to host:                     | 0.000596809     |  0.4        | 9               | 11.5 GB/s
+Alloc/Free on device:                    | 7.26065e-05     |  0.1        | 9               | 
+GPU: 96% Copy H<->D: 0.78% Alloc/free: 0.052% Comm: 0% CPU & I/O: 3.5%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.000191543    
+
+Total time for the whole computation                                       7.07695        
 
diff --git a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.lumi_gfx90a b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.lumi_gfx90a
index b42b08e3c6..3b64cfeb32 100644
--- a/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.lumi_gfx90a
+++ b/tests/GPU/OpenMP_QC/OpenMP_QC_BENCH.TU.lumi_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     24-02-2026 -- 00:32:35
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     15-05-2026 -- 21:03:37
+OS:       nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,40 +22,40 @@ Total number of elements used for the calculation: 393216
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                46.4693        
+Total time of the start-up:                                                41.8307        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.61811        
+Average time of the resolution of the linear problem per call:             1.59457        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               0.303302       
+Total time of the time loop:                                               0.275783       
 Number of time steps:                                                      2              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.151651       
-Standard deviation between time steps:                                     0.00396357     
-Time elapsed in the skipped time steps:                                    0.480891       
+Average time per time step:                                                0.137892       
+Standard deviation between time steps:                                     0.00135879     
+Time elapsed in the skipped time steps:                                    0.468064       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0502534       | 12.8        | 2              
-Matrix assembly for implicit scheme      | 0.008697519     |  2.2        | 4              
-Convection operator                      | 0.008822867     |  2.3        | 6              
-Diffusion operator                       | 0.01684057      |  4.3        | 18             
-Gradient operator                        | 0.01633068      |  4.2        | 5              
-Divergence operator                      | 0.004752977     |  1.2        | 6              
-Source terms                             | 0.0005051555    |  0.1        | 4              
-Update ::mettre_a_jour                   | 0.01738123      |  4.4        | 4              
-Solver for implicit diffusion            | 0.008576068     |  2.2        | 4              
-Computation of the time step dt          | 0.003482782     |  0.9        | 6              
-Post-treatment operations                | 0.008540245     |  2.2        | 1              
-Other operations                         | 0.007467535     |  1.9        | 
+Linear solver resolutions Ax=B           | 0.0452876       | 32.8        | 2              
+Matrix assembly for implicit scheme      | 0.008186238     |  5.9        | 4              
+Convection operator                      | 0.008172697     |  5.9        | 6              
+Diffusion operator                       | 0.01754431      | 12.7        | 18             
+Gradient operator                        | 0.009107283     |  6.6        | 5              
+Divergence operator                      | 0.004602755     |  3.3        | 6              
+Source terms                             | 0.007739103     |  5.6        | 4              
+Update ::mettre_a_jour                   | 0.008671992     |  6.3        | 4              
+Solver for implicit diffusion            | 0.008301618     |  6.0        | 4              
+Computation of the time step dt          | 0.003485676     |  2.5        | 6              
+Post-treatment operations                | 0.008401907     |  6.1        | 1              
+Other operations                         | 0.008390358     |  6.1        | 
 
-Average number of iteration of the linear solver per call:                 54             
+Average number of iteration of the linear solver per call:                 56.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 54
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.049896        | 32.9        | 2               | 
-Kernels:                                 | 0.0915515       | 60.4        | 963             | 
-Copy host to device:                     | 0.000483606     |  0.3        | 21              | 2.0 GB/s
-Copy device to host:                     | 0.000589767     |  0.4        | 9               | 11.6 GB/s
-Alloc/Free on device:                    | 7.3544e-05      |  0.0        | 9               | 
-GPU: 93% Copy H<->D: 0.71% Alloc/free: 0.048% Comm: 0% CPU & I/O: 6%
+Libraries:                               | 0.0450666       | 32.7        | 2               | 
+Kernels:                                 | 0.082456        | 59.8        | 963             | 
+Copy host to device:                     | 0.000476035     |  0.3        | 21              | 2.1 GB/s
+Copy device to host:                     | 0.000586974     |  0.4        | 9               | 11.7 GB/s
+Alloc/Free on device:                    | 5.8974e-05      |  0.0        | 9               | 
+GPU: 92% Copy H<->D: 0.77% Alloc/free: 0.043% Comm: 0% CPU & I/O: 6.7%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.00246653     
+Time of the post-resolution:                                               0.00315599     
 
-Total time for the whole computation                                       47.256         
+Total time for the whole computation                                       42.5777        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (64 s):  0.438 kW  0.008 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/PETSC_GAMG/PETSC_GAMG.lml.gz b/tests/GPU/PETSC_GAMG/PETSC_GAMG.lml.gz
new file mode 120000
index 0000000000..3b9fb55c30
--- /dev/null
+++ b/tests/GPU/PETSC_GAMG/PETSC_GAMG.lml.gz
@@ -0,0 +1 @@
+../JEL_bous/JEL_bous.lml.gz
\ No newline at end of file
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS.data b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS.data
index 84f1cff86d..5d480ad45e 100644
--- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS.data
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS.data
@@ -41,7 +41,8 @@ Scatter DOM.Zones dom_fluide
 END SCATTER #
 
 
-VEFPreP1B dis
+VEFPreP1B dis 
+Lire dis { reorder { algo Hilbert } }
 
 Scheme_euler_implicit sch
 Read sch
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx90a b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx90a
index d27d19598c..692c68d2fd 100644
--- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx90a
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 18:16:57
-OS:       g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     24-05-2026 -- 16:10:58
+OS:       g1266__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                54.0032        
+Total time of the start-up:                                                44.6265        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             2.48562        
+Average time of the resolution of the linear problem per call:             3.33197        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               16.1813        
+Total time of the time loop:                                               13.9877        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.79792        
-Standard deviation between time steps:                                     0.0782326      
-Time elapsed in the skipped time steps:                                    15.1899        
+Average time per time step:                                                1.55419        
+Standard deviation between time steps:                                     0.0714623      
+Time elapsed in the skipped time steps:                                    26.42          
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.952062        | 27.3        | 3              
-Matrix assembly for implicit scheme      | 0.1737865       |  5.0        | 1              
-Convection operator                      | 0.2060802       |  5.9        | 4              
-Diffusion operator                       | 0.014529        |  0.4        | 2              
-Divergence operator                      | 0.03175503      |  0.9        | 4              
-Source terms                             | 0.0007008112    |  0.0        | 2              
-Update ::mettre_a_jour                   | 0.01173972      |  0.3        | 4              
-Computation of the time step dt          | 0.001874476     |  0.1        | 4              
-Post-treatment operations                | 0.02246132      |  0.6        | 1              
-Other operations                         | 0.3829349       | 11.0        | 
+Linear solver resolutions Ax=B           | 0.88151         | 56.7        | 3              
+Matrix assembly for implicit scheme      | 0.1248069       |  8.0        | 1              
+Convection operator                      | 0.152424        |  9.8        | 4              
+Diffusion operator                       | 0.01211028      |  0.8        | 2              
+Divergence operator                      | 0.02130311      |  1.4        | 4              
+Source terms                             | 0.0005407483    |  0.0        | 2              
+Update ::mettre_a_jour                   | 0.00905912      |  0.6        | 4              
+Computation of the time step dt          | 0.001572527     |  0.1        | 4              
+Post-treatment operations                | 0.02116038      |  1.4        | 1              
+Other operations                         | 0.3296992       | 21.2        | 
 
-Average number of iteration of the linear solver per call:                 45.1           
+Average number of iteration of the linear solver per call:                 45.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 45.1
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.929697        | 51.7        | 3               | 
-Kernels:                                 | 0.856626        | 47.6        | 435             | 
-Copy host to device:                     | 0.000691429     |  0.0        | 21              | 6.5 GB/s
-Copy device to host:                     | 0.000787623     |  0.0        | 7               | 14.1 GB/s
-Alloc/Free on device:                    | 0.00013016      |  0.0        | 4               | 
-GPU: 99% Copy H<->D: 0.082% Alloc/free: 0.0072% Comm: 0% CPU & I/O: 0.56%
+Libraries:                               | 0.859416        | 55.3        | 3               | 
+Kernels:                                 | 0.682799        | 43.9        | 433             | 
+Copy host to device:                     | 0.000688401     |  0.0        | 21              | 6.5 GB/s
+Copy device to host:                     | 0.000780124     |  0.1        | 7               | 14.3 GB/s
+Alloc/Free on device:                    | 0.000153926     |  0.0        | 4               | 
+GPU: 99% Copy H<->D: 0.094% Alloc/free: 0.0099% Comm: 0% CPU & I/O: 0.67%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.200093       
+Time of the post-resolution:                                               0.203241       
 
-Total time for the whole computation                                       85.5745        
+Total time for the whole computation                                       85.2374        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (91 s):  0.524 kW  0.013 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx942 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx942
index 3c55fe1cfc..068c7032b7 100644
--- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx942
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.adastra_gfx942
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     17-03-2026 -- 18:08:32
-OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 14:57:40
+OS:       a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD Instinct MI300A Accelerator
 Total number of threads:192
 GPU model: AMD Instinct MI300A
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                87.9476        
+Total time of the start-up:                                                69.2005        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             3.59228        
+Average time of the resolution of the linear problem per call:             4.08395        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               6.86811        
+Total time of the time loop:                                               6.57321        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.763124       
-Standard deviation between time steps:                                     0.111657       
-Time elapsed in the skipped time steps:                                    26.2479        
+Average time per time step:                                                0.730357       
+Standard deviation between time steps:                                     0.108724       
+Time elapsed in the skipped time steps:                                    29.3565        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.580801        | 76.1        | 3              
-Matrix assembly for implicit scheme      | 0.03724753      |  4.9        | 1              
-Convection operator                      | 0.04440546      |  5.8        | 4              
-Diffusion operator                       | 0.004041744     |  0.5        | 2              
-Divergence operator                      | 0.008965229     |  1.2        | 4              
-Source terms                             | 0.0002944104    |  0.0        | 2              
-Update ::mettre_a_jour                   | 0.006270745     |  0.8        | 4              
-Computation of the time step dt          | 0.001288767     |  0.2        | 4              
-Post-treatment operations                | 0.01114546      |  1.5        | 1              
-Other operations                         | 0.06866393      |  9.0        | 
+Linear solver resolutions Ax=B           | 0.560471        | 76.7        | 3              
+Matrix assembly for implicit scheme      | 0.03394545      |  4.6        | 1              
+Convection operator                      | 0.0405187       |  5.5        | 4              
+Diffusion operator                       | 0.004115085     |  0.6        | 2              
+Divergence operator                      | 0.006495622     |  0.9        | 4              
+Source terms                             | 0.0003109058    |  0.0        | 2              
+Update ::mettre_a_jour                   | 0.005631407     |  0.8        | 4              
+Computation of the time step dt          | 0.001290499     |  0.2        | 4              
+Post-treatment operations                | 0.01139088      |  1.6        | 1              
+Other operations                         | 0.06618752      |  9.1        | 
 
-Average number of iteration of the linear solver per call:                 45.1           
+Average number of iteration of the linear solver per call:                 45.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 45.1
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.571281        | 74.9        | 3               | 
-Kernels:                                 | 0.179249        | 23.5        | 433             | 
-Copy host to device:                     | 0.000686672     |  0.1        | 21              | 6.5 GB/s
-Copy device to host:                     | 0.000558109     |  0.1        | 7               | 20.0 GB/s
-Alloc/Free on device:                    | 0.00091174      |  0.1        | 4               | 
-GPU: 98% Copy H<->D: 0.16% Alloc/free: 0.12% Comm: 0% CPU & I/O: 1.4%
+Libraries:                               | 0.551041        | 75.4        | 3               | 
+Kernels:                                 | 0.165837        | 22.7        | 433             | 
+Copy host to device:                     | 0.000688216     |  0.1        | 21              | 6.5 GB/s
+Copy device to host:                     | 0.000562285     |  0.1        | 7               | 19.8 GB/s
+Alloc/Free on device:                    | 0.000853448     |  0.1        | 4               | 
+GPU: 98% Copy H<->D: 0.17% Alloc/free: 0.12% Comm: 0% CPU & I/O: 1.6%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.189987       
+Time of the post-resolution:                                               0.207525       
 
-Total time for the whole computation                                       121.254        
+Total time for the whole computation                                       105.338        
 
-[Slurm] Power consumption (136 s):  0.671 kW  0.025 kWh  0.003 € (0.10€/kWh)
+[Slurm] Power consumption (116 s):  0.683 kW  0.022 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.dalianvl_cc100 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..a9f5b2197f
--- /dev/null
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PETSC_KOKKOS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:18:30
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                39.7573        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             2.71276        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.37842        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.264269       
+Standard deviation between time steps:                                     0.0171366      
+Time elapsed in the skipped time steps:                                    14.8921        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.203571        | 77.0        | 3              
+Matrix assembly for implicit scheme      | 0.01233993      |  4.7        | 1              
+Convection operator                      | 0.01126966      |  4.3        | 4              
+Diffusion operator                       | 0.001828819     |  0.7        | 2              
+Divergence operator                      | 0.001690409     |  0.6        | 4              
+Source terms                             | 0.0001626662    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.002726895     |  1.0        | 4              
+Computation of the time step dt          | 0.0005716936    |  0.2        | 4              
+Post-treatment operations                | 0.005884185     |  2.2        | 1              
+Other operations                         | 0.02422413      |  9.2        | 
+
+Average number of iteration of the linear solver per call:                 45.1           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.196436        | 74.3        | 3               | 
+Kernels:                                 | 0.0560966       | 21.2        | 433             | 
+Copy host to device:                     | 0.00043351      |  0.2        | 21              | 10.3 GB/s
+Copy device to host:                     | 0.000356846     |  0.1        | 7               | 31.2 GB/s
+Alloc/Free on device:                    | 0.00231052      |  0.9        | 4               | 
+GPU: 96% Copy H<->D: 0.3% Alloc/free: 0.87% Comm: 0% CPU & I/O: 3.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.155213       
+
+Total time for the whole computation                                       57.183         
+
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.eureka_cc89 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..fcce0ed49b
--- /dev/null
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.eureka_cc89
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PETSC_KOKKOS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:35:53
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                48.8101        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             3.58473        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               8.85236        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.983596       
+Standard deviation between time steps:                                     0.0820317      
+Time elapsed in the skipped time steps:                                    37.7377        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.759662        | 77.2        | 3              
+Matrix assembly for implicit scheme      | 0.03474596      |  3.5        | 1              
+Convection operator                      | 0.03646882      |  3.7        | 4              
+Diffusion operator                       | 0.003950059     |  0.4        | 2              
+Divergence operator                      | 0.004843026     |  0.5        | 4              
+Source terms                             | 0.000650252     |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.004128376     |  0.4        | 4              
+Computation of the time step dt          | 0.001334431     |  0.1        | 4              
+Post-treatment operations                | 0.007428737     |  0.8        | 1              
+Other operations                         | 0.1303845       | 13.3        | 
+
+Average number of iteration of the linear solver per call:                 45.1           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.727012        | 73.9        | 3               | 
+Kernels:                                 | 0.246978        | 25.1        | 433             | 
+Copy host to device:                     | 0.00103084      |  0.1        | 21              | 4.3 GB/s
+Copy device to host:                     | 0.00146876      |  0.1        | 7               | 7.6 GB/s
+Alloc/Free on device:                    | 0.000581128     |  0.1        | 4               | 
+GPU: 99% Copy H<->D: 0.25% Alloc/free: 0.059% Comm: 0% CPU & I/O: 0.66%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0882911      
+
+Total time for the whole computation                                       95.4885        
+
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is157091_cc86 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is157091_cc86
index bfa2beebaa..cca066e59c 100644
--- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is157091_cc86
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is157091_cc86
@@ -8,7 +8,7 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 12:01:52
+Date:     22-04-2026 -- 20:47:12
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                43.8829        
+Total time of the start-up:                                                44.5716        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             2.7629         
+Average time of the resolution of the linear problem per call:             3.22022        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               14.0367        
+Total time of the time loop:                                               12.358         
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.55963        
-Standard deviation between time steps:                                     0.104933       
-Time elapsed in the skipped time steps:                                    21.595         
+Average time per time step:                                                1.37311        
+Standard deviation between time steps:                                     0.098936       
+Time elapsed in the skipped time steps:                                    28.8715        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 1.02044         | 65.4        | 3              
-Matrix assembly for implicit scheme      | 0.09923768      |  6.4        | 1              
-Convection operator                      | 0.09379117      |  6.0        | 4              
-Diffusion operator                       | 0.007355902     |  0.5        | 2              
-Divergence operator                      | 0.02657271      |  1.7        | 4              
-Source terms                             | 0.001260578     |  0.1        | 2              
-Update ::mettre_a_jour                   | 0.0112653       |  0.7        | 4              
-Computation of the time step dt          | 0.00229102      |  0.1        | 4              
-Post-treatment operations                | 0.0168292       |  1.1        | 1              
-Other operations                         | 0.2805938       | 18.0        | 
+Linear solver resolutions Ax=B           | 0.930939        | 67.8        | 3              
+Matrix assembly for implicit scheme      | 0.07174572      |  5.2        | 1              
+Convection operator                      | 0.06642676      |  4.8        | 4              
+Diffusion operator                       | 0.005771405     |  0.4        | 2              
+Divergence operator                      | 0.01625026      |  1.2        | 4              
+Source terms                             | 0.0007618684    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.00867656      |  0.6        | 4              
+Computation of the time step dt          | 0.002122282     |  0.2        | 4              
+Post-treatment operations                | 0.01133303      |  0.8        | 1              
+Other operations                         | 0.2590803       | 18.9        | 
 
-Average number of iteration of the linear solver per call:                 44.6           
+Average number of iteration of the linear solver per call:                 45.1           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -61,16 +61,16 @@ Average number of iteration of the linear solver per call:                 44.6
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.980411        | 62.9        | 3               | 
-Kernels:                                 | 0.569024        | 36.5        | 435             | 
-Copy host to device:                     | 0.001424        |  0.1        | 21              | 3.1 GB/s
-Copy device to host:                     | 0.0010956       |  0.1        | 7               | 10.2 GB/s
-Alloc/Free on device:                    | 0.000533747     |  0.0        | 4               | 
-GPU: 99% Copy H<->D: 0.16% Alloc/free: 0.034% Comm: 0% CPU & I/O: 0.46%
+Libraries:                               | 0.892538        | 65.0        | 3               | 
+Kernels:                                 | 0.471266        | 34.3        | 433             | 
+Copy host to device:                     | 0.00140124      |  0.1        | 21              | 3.2 GB/s
+Copy device to host:                     | 0.00110888      |  0.1        | 7               | 10.0 GB/s
+Alloc/Free on device:                    | 0.000523076     |  0.0        | 4               | 
+GPU: 99% Copy H<->D: 0.18% Alloc/free: 0.038% Comm: 0% CPU & I/O: 0.46%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.301731       
+Time of the post-resolution:                                               0.265491       
 
-Total time for the whole computation                                       79.8164        
+Total time for the whole computation                                       86.0666        
 
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is159479_cc120 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..c21639b3b7
--- /dev/null
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is159479_cc120
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PETSC_KOKKOS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:37:16
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                34.6555        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             2.12537        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               4.81023        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.53447        
+Standard deviation between time steps:                                     0.043241       
+Time elapsed in the skipped time steps:                                    20.3312        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.411461        | 77.0        | 3              
+Matrix assembly for implicit scheme      | 0.02128278      |  4.0        | 1              
+Convection operator                      | 0.02519871      |  4.7        | 4              
+Diffusion operator                       | 0.002420275     |  0.5        | 2              
+Divergence operator                      | 0.002600299     |  0.5        | 4              
+Source terms                             | 0.0003420792    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.002391873     |  0.4        | 4              
+Computation of the time step dt          | 0.0007405434    |  0.1        | 4              
+Post-treatment operations                | 0.005288055     |  1.0        | 1              
+Other operations                         | 0.06274485      | 11.7        | 
+
+Average number of iteration of the linear solver per call:                 45.1           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.393842        | 73.7        | 3               | 
+Kernels:                                 | 0.13385         | 25.0        | 433             | 
+Copy host to device:                     | 0.000492432     |  0.1        | 21              | 9.1 GB/s
+Copy device to host:                     | 0.00135689      |  0.3        | 7               | 8.2 GB/s
+Alloc/Free on device:                    | 0.000450243     |  0.1        | 4               | 
+GPU: 99% Copy H<->D: 0.35% Alloc/free: 0.084% Comm: 0% CPU & I/O: 0.84%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0666706      
+
+Total time for the whole computation                                       59.8636        
+
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is247793_gfx1100 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is247793_gfx1100
new file mode 100644
index 0000000000..f8732dd43a
--- /dev/null
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.is247793_gfx1100
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PETSC_KOKKOS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 17:41:36
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2560000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                38.5024        
+
+Number of calls to the linear solver per time step:                        1              
+Average time of the resolution of the linear problem per call:             4.06631        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               33.9639        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                3.77377        
+Standard deviation between time steps:                                     0.552431       
+Time elapsed in the skipped time steps:                                    29.7015        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 3.40042         | 90.1        | 3              
+Matrix assembly for implicit scheme      | 0.06572012      |  1.7        | 1              
+Convection operator                      | 0.077525        |  2.1        | 4              
+Diffusion operator                       | 0.008007824     |  0.2        | 2              
+Divergence operator                      | 0.01260207      |  0.3        | 4              
+Source terms                             | 0.0009658954    |  0.0        | 2              
+Update ::mettre_a_jour                   | 0.007784225     |  0.2        | 4              
+Computation of the time step dt          | 0.01158151      |  0.3        | 4              
+Post-treatment operations                | 0.01737637      |  0.5        | 1              
+Other operations                         | 0.1717834       |  4.6        | 
+
+Average number of iteration of the linear solver per call:                 44.2           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 3.3543          | 88.9        | 3               | 
+Kernels:                                 | 0.408706        | 10.8        | 433             | 
+Copy host to device:                     | 0.000746001     |  0.0        | 21              | 6.0 GB/s
+Copy device to host:                     | 0.000728967     |  0.0        | 7               | 15.3 GB/s
+Alloc/Free on device:                    | 0.000857389     |  0.0        | 4               | 
+GPU: 1e+02% Copy H<->D: 0.039% Alloc/free: 0.023% Comm: 0% CPU & I/O: 0.22%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0997579      
+
+Total time for the whole computation                                       102.268        
+
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.jean-zay_cc90 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.jean-zay_cc90
index f2c082287e..f410cf6bb9 100644
--- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.jean-zay_cc90
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.jean-zay_cc90
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     21-02-2026 -- 07:39:41
-OS:       jzxh025__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+Date:     13-05-2026 -- 10:01:46
+OS:       jzxh116__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
 CPU model : Intel(R) Xeon(R) Platinum 8468
 Total number of threads:192
 GPU model: NVIDIA H100 80GB HBM3
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                41.4787        
+Total time of the start-up:                                                50.4068        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             2.43523        
+Average time of the resolution of the linear problem per call:             3.0547         
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               4.31082        
+Total time of the time loop:                                               4.01735        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.47898        
-Standard deviation between time steps:                                     0.0324794      
-Time elapsed in the skipped time steps:                                    18.8744        
+Average time per time step:                                                0.446372       
+Standard deviation between time steps:                                     0.0328116      
+Time elapsed in the skipped time steps:                                    30.0468        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.324965        | 67.8        | 3              
-Matrix assembly for implicit scheme      | 0.02646704      |  5.5        | 1              
-Convection operator                      | 0.02453415      |  5.1        | 4              
-Diffusion operator                       | 0.003655498     |  0.8        | 2              
-Divergence operator                      | 0.006610645     |  1.4        | 4              
-Source terms                             | 0.0004884063    |  0.1        | 2              
-Update ::mettre_a_jour                   | 0.004852666     |  1.0        | 4              
-Computation of the time step dt          | 0.0008704661    |  0.2        | 4              
-Post-treatment operations                | 0.00899222      |  1.9        | 1              
-Other operations                         | 0.07754328      | 16.2        | 
+Linear solver resolutions Ax=B           | 0.314238        | 70.4        | 3              
+Matrix assembly for implicit scheme      | 0.01949049      |  4.4        | 1              
+Convection operator                      | 0.01833284      |  4.1        | 4              
+Diffusion operator                       | 0.003106423     |  0.7        | 2              
+Divergence operator                      | 0.003050106     |  0.7        | 4              
+Source terms                             | 0.0002473428    |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.0037138       |  0.8        | 4              
+Computation of the time step dt          | 0.0007315362    |  0.2        | 4              
+Post-treatment operations                | 0.009454939     |  2.1        | 1              
+Other operations                         | 0.07400625      | 16.6        | 
 
-Average number of iteration of the linear solver per call:                 45             
+Average number of iteration of the linear solver per call:                 45.1           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 45
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.313836        | 65.5        | 3               | 
-Kernels:                                 | 0.15182         | 31.7        | 435             | 
-Copy host to device:                     | 0.000786192     |  0.2        | 21              | 5.7 GB/s
-Copy device to host:                     | 0.00176988      |  0.4        | 7               | 6.3 GB/s
-Alloc/Free on device:                    | 0.000779791     |  0.2        | 4               | 
-GPU: 97% Copy H<->D: 0.53% Alloc/free: 0.16% Comm: 0% CPU & I/O: 2.1%
+Libraries:                               | 0.303017        | 67.9        | 3               | 
+Kernels:                                 | 0.128906        | 28.9        | 433             | 
+Copy host to device:                     | 0.000767944     |  0.2        | 21              | 5.8 GB/s
+Copy device to host:                     | 0.00174116      |  0.4        | 7               | 6.4 GB/s
+Alloc/Free on device:                    | 0.000756607     |  0.2        | 4               | 
+GPU: 97% Copy H<->D: 0.56% Alloc/free: 0.17% Comm: 0% CPU & I/O: 2.5%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.193317       
+Time of the post-resolution:                                               0.135781       
 
-Total time for the whole computation                                       64.8573        
+Total time for the whole computation                                       84.6067        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (94 s):  0.440 kW  0.011 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.lumi_gfx90a b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.lumi_gfx90a
index cbe9f936da..b06ce76fa0 100644
--- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.lumi_gfx90a
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.lumi_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     24-02-2026 -- 00:35:37
-OS:       nid007971__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     15-05-2026 -- 21:06:28
+OS:       nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                87.1868        
+Total time of the start-up:                                                92.6822        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             6.21404        
+Average time of the resolution of the linear problem per call:             7.19735        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               15.8511        
+Total time of the time loop:                                               13.9732        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                1.76123        
-Standard deviation between time steps:                                     0.0801091      
-Time elapsed in the skipped time steps:                                    30.8973        
+Average time per time step:                                                1.55257        
+Standard deviation between time steps:                                     0.0658627      
+Time elapsed in the skipped time steps:                                    40.3491        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.942021        | 18.1        | 3              
-Matrix assembly for implicit scheme      | 0.1707076       |  3.3        | 1              
-Convection operator                      | 0.2086225       |  4.0        | 4              
-Diffusion operator                       | 0.01388809      |  0.3        | 2              
-Divergence operator                      | 0.03186915      |  0.6        | 4              
-Source terms                             | 0.0006588372    |  0.0        | 2              
-Update ::mettre_a_jour                   | 0.01178035      |  0.2        | 4              
-Computation of the time step dt          | 0.001818052     |  0.0        | 4              
-Post-treatment operations                | 0.02213894      |  0.4        | 1              
-Other operations                         | 0.3577303       |  6.9        | 
+Linear solver resolutions Ax=B           | 0.873477        | 56.3        | 3              
+Matrix assembly for implicit scheme      | 0.1348632       |  8.7        | 1              
+Convection operator                      | 0.1748085       | 11.3        | 4              
+Diffusion operator                       | 0.01119091      |  0.7        | 2              
+Divergence operator                      | 0.01998403      |  1.3        | 4              
+Source terms                             | 0.00281269      |  0.2        | 2              
+Update ::mettre_a_jour                   | 0.008613155     |  0.6        | 4              
+Computation of the time step dt          | 0.001514032     |  0.1        | 4              
+Post-treatment operations                | 0.01969925      |  1.3        | 1              
+Other operations                         | 0.3056097       | 19.7        | 
 
-Average number of iteration of the linear solver per call:                 44.6           
+Average number of iteration of the linear solver per call:                 45.2           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 44.6
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.919261        | 52.2        | 3               | 
-Kernels:                                 | 0.830634        | 47.2        | 435             | 
-Copy host to device:                     | 0.000698769     |  0.0        | 21              | 6.4 GB/s
-Copy device to host:                     | 0.000794326     |  0.0        | 7               | 14.0 GB/s
-Alloc/Free on device:                    | 0.000123277     |  0.0        | 4               | 
-GPU: 99% Copy H<->D: 0.085% Alloc/free: 0.007% Comm: 0% CPU & I/O: 0.55%
+Libraries:                               | 0.851297        | 54.8        | 3               | 
+Kernels:                                 | 0.689726        | 44.4        | 433             | 
+Copy host to device:                     | 0.00081278      |  0.1        | 21              | 5.5 GB/s
+Copy device to host:                     | 0.000797494     |  0.1        | 7               | 14.0 GB/s
+Alloc/Free on device:                    | 0.000133037     |  0.0        | 4               | 
+GPU: 99% Copy H<->D: 0.1% Alloc/free: 0.0086% Comm: 0% CPU & I/O: 0.63%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.190694       
+Time of the post-resolution:                                               0.181855       
 
-Total time for the whole computation                                       134.126        
+Total time for the whole computation                                       147.186        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (170 s):  0.512 kW  0.024 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.topaze_cc80 b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.topaze_cc80
index 0b7897eebc..c96cace81a 100644
--- a/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.topaze_cc80
+++ b/tests/GPU/PETSC_KOKKOS/PETSC_KOKKOS_BENCH.TU.topaze_cc80
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 15:58:09
-OS:       topaze7070__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     15-05-2026 -- 13:55:52
+OS:       topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2560000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                56.9018        
+Total time of the start-up:                                                58.4819        
 
 Number of calls to the linear solver per time step:                        1              
-Average time of the resolution of the linear problem per call:             4.27597        
+Average time of the resolution of the linear problem per call:             4.08312        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               6.81565        
+Total time of the time loop:                                               6.22125        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.757294       
-Standard deviation between time steps:                                     0.0561637      
-Time elapsed in the skipped time steps:                                    29.5074        
+Average time per time step:                                                0.69125        
+Standard deviation between time steps:                                     0.0538561      
+Time elapsed in the skipped time steps:                                    38.441         
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.530545        | 70.1        | 3              
-Matrix assembly for implicit scheme      | 0.04442663      |  5.9        | 1              
-Convection operator                      | 0.04330396      |  5.7        | 4              
-Diffusion operator                       | 0.005314186     |  0.7        | 2              
-Divergence operator                      | 0.008753432     |  1.2        | 4              
-Source terms                             | 0.001030859     |  0.1        | 2              
-Update ::mettre_a_jour                   | 0.004987524     |  0.7        | 4              
-Computation of the time step dt          | 0.001178208     |  0.2        | 4              
-Post-treatment operations                | 0.01122804      |  1.5        | 1              
-Other operations                         | 0.1065263       | 14.1        | 
+Linear solver resolutions Ax=B           | 0.510936        | 73.9        | 3              
+Matrix assembly for implicit scheme      | 0.02969765      |  4.3        | 1              
+Convection operator                      | 0.02792525      |  4.0        | 4              
+Diffusion operator                       | 0.004167876     |  0.6        | 2              
+Divergence operator                      | 0.003910002     |  0.6        | 4              
+Source terms                             | 0.000349984     |  0.1        | 2              
+Update ::mettre_a_jour                   | 0.004006453     |  0.6        | 4              
+Computation of the time step dt          | 0.0009945288    |  0.1        | 4              
+Post-treatment operations                | 0.01014142      |  1.5        | 1              
+Other operations                         | 0.09912006      | 14.3        | 
 
-Average number of iteration of the linear solver per call:                 45             
+Average number of iteration of the linear solver per call:                 45.1           
 
 
 -----------------------------------------------------------------------------------------------------------
@@ -61,17 +61,17 @@ Average number of iteration of the linear solver per call:                 45
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.512967        | 67.7        | 3               | 
-Kernels:                                 | 0.230442        | 30.4        | 434             | 
-Copy host to device:                     | 0.00184645      |  0.2        | 21              | 2.4 GB/s
-Copy device to host:                     | 0.00140153      |  0.2        | 7               | 7.9 GB/s
-Alloc/Free on device:                    | 0.00091814      |  0.1        | 4               | 
-GPU: 98% Copy H<->D: 0.43% Alloc/free: 0.12% Comm: 0% CPU & I/O: 1.3%
+Libraries:                               | 0.493351        | 71.4        | 3               | 
+Kernels:                                 | 0.183954        | 26.6        | 433             | 
+Copy host to device:                     | 0.00171774      |  0.2        | 21              | 2.6 GB/s
+Copy device to host:                     | 0.000954437     |  0.1        | 7               | 11.7 GB/s
+Alloc/Free on device:                    | 0.000935466     |  0.1        | 4               | 
+GPU: 98% Copy H<->D: 0.39% Alloc/free: 0.14% Comm: 0% CPU & I/O: 1.5%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.308962       
+Time of the post-resolution:                                               0.211632       
 
-Total time for the whole computation                                       93.5339        
+Total time for the whole computation                                       103.356        
 
-[Slurm] Power consumption (131 s):  0.377 kW  0.014 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (135 s):  0.554 kW  0.021 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/PNE_LES_LHe/PNE_LES_LHe.data b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe.data
new file mode 100644
index 0000000000..2a66cff8f0
--- /dev/null
+++ b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe.data
@@ -0,0 +1,245 @@
+# PARALLEL OK #
+# Warning: Mesh is reduced compared to the real one #
+dimension 3
+
+pb_thermohydraulique_turbulent pb_fluide
+Domaine dom_fluide
+
+# BEGIN MESH #
+Mailler dom_fluide
+{
+      Pave Cav_leftdown
+                {
+                	Origine 0. 0.  0
+                	Nombre_de_Noeuds 11 10  3
+                	Longueurs 0.02  0.1   0.03
+			Facteurs 1.01  1.0   1
+                }
+                {
+                	Bord front   	 Z = 0.      0.  <= X <= 0.02    0.  <= Y <= 0.1        
+                	Bord back   	 Z = 0.03    0.  <= X <= 0.02    0.  <= Y <= 0.1        
+                	Bord left   	 X = 0.      0.  <= Y <= 0.1    0.  <= Z <= 0.03      
+                	bord bas         Y = 0.      0.  <= X <= 0.02    0.  <= Z <= 0.03   
+                } ,
+        Pave Cav_leftmid
+                {
+                	Origine 0. 0.1  0.
+                	Nombre_de_Noeuds 11 110  3
+                	Longueurs 0.02  0.95  0.03
+			Facteurs 1.01  1.0  1
+                }
+                {
+                	Bord front   	 Z = 0.      0.  <= X <= 0.02    0.1  <= Y <= 1.05        
+                	Bord back   	 Z = 0.03    0.  <= X <= 0.02    0.1  <= Y <= 1.05        
+                	Bord wallheat    X = 0.      0.1  <= Y <= 1.05     0.  <= Z <= 0.03   
+                } ,
+		
+        Pave Cav_leftup
+                {
+                	Origine 0. 1.05  0
+                	Nombre_de_Noeuds 11 25  3
+                	Longueurs 0.02  1.5  0.03
+			Facteurs 1.01 1.01  1
+                }
+                {
+                	Bord left       X = 0.    1.05  <= Y <= 2.55    0.  <= Z <= 0.03          
+                	bord up          Y = 2.55    0.  <= X <= 0.02   0.  <= Z <= 0.03       
+                	Bord front   	 Z = 0.      0.  <= X <= 0.02    1.05  <= Y <= 2.55       
+                	Bord back   	 Z = 0.03    0.  <= X <= 0.02    1.05  <= Y <= 2.55       
+                } ,
+			
+       Pave Cav_rightdown
+                {
+                	Origine 0.02 0.  0
+                	Nombre_de_Noeuds 4 10  3
+                	Longueurs 0.23  0.1   0.03
+			Facteurs 1  1.0   1
+                }
+                {
+                	Bord front   	 Z = 0.      0.02  <= X <= 0.25    0.  <= Y <= 0.1        
+                	Bord back   	 Z = 0.03    0.02  <= X <= 0.25    0.  <= Y <= 0.1        
+                	Bord right      X = 0.25     0.  <= Y <= 0.1   0.  <= Z <= 0.03           
+                	bord bas         Y = 0.      0.02  <= X <= 0.249    0.  <= Z <= 0.03   
+                	bord inlet         Y = 0.      0.249  <= X <= 0.25    0.  <= Z <= 0.03   		 
+                } ,
+        Pave Cav_rightmid
+                {
+                	Origine 0.02 0.1  0
+                	Nombre_de_Noeuds 4 110  3
+                	Longueurs 0.23  0.95  0.03
+			Facteurs 1  1.0  1
+                }
+                {
+                 	Bord front   	 Z = 0.      0.02  <= X <= 0.25     0.1  <= Y <= 1.05         
+                	Bord back   	 Z = 0.03    0.02  <= X <= 0.25      0.1  <= Y <= 1.05         
+                	Bord right       X = 0.25     0.1  <= Y <= 1.05   0.  <= Z <= 0.03            
+                } ,		
+        Pave Cav_rightup
+                {
+                	Origine 0.02 1.05  0
+                	Nombre_de_Noeuds 4 25  3
+                	Longueurs 0.23  1.5  0.03
+			Facteurs 1  1.01  1
+                } 
+                {
+                 	Bord right   	 X = 0.25      1.05  <= Y <= 2.55       0.  <= Z <= 0.03          
+                	bord up          Y = 2.55     0.02  <= X <= 0.249      0.  <= Z <= 0.03        
+                	bord outlet      Y = 2.55    0.249  <= X <= 0.25     0.  <= Z <= 0.03       
+                	Bord front   	 Z = 0.      0.02  <= X <= 0.25    1.05  <= Y <= 2.55        
+                	Bord back   	 Z = 0.03    0.02  <= X <= 0.25    1.05  <= Y <= 2.55        	
+                } 	
+}
+/* raffiner_isotrope dom_fluide raffiner_isotrope dom_fluide raffiner_isotrope dom_fluide */
+RegroupeBord dom_fluide perio { front back }
+Corriger_frontiere_periodique { domaine dom_fluide bord perio }
+# END MESH #
+
+# BEGIN PARTITION
+Partition dom_fluide
+{
+	Partition_tool metis { Nb_parts 2 }
+	Larg_joint 2
+	zones_name DOM
+    	single_hdf
+}
+END PARTITION #
+
+# BEGIN SCATTER
+Scatter DOM.Zones dom_fluide
+END SCATTER #
+
+VDF dis
+Runge_Kutta_ordre_3 sch
+Read sch
+{
+    nb_pas_dt_max 10
+    tinit 0
+    tmax 30
+    dt_min 5E-7
+    dt_max 1E-2
+    dt_start dt_fixe 1E-6
+    dt_impr 1.e-1
+    dt_sauv -1
+    seuil_statio 1.e-18
+    diffusion_implicite 1
+    Facsec 1
+}
+
+Associate pb_fluide dom_fluide
+Associate  pb_fluide sch
+Discretize pb_fluide dis
+
+
+Read pb_fluide
+{
+    fluide_incompressible {
+        gravite Champ_Uniforme 3  0. -9.81 0
+        mu       Champ_Fonc_Fonction  pb_fluide temperature 1 -2.90526e-05*val+6.56303e-06*val^2-5.0438e-07*val^3+4.71169e-05
+        rho      Champ_Uniforme 1   125.9132
+        lambda  Champ_Fonc_Fonction pb_fluide temperature 1  0.01966864+val-val
+        Cp     Champ_Uniforme 1   5128.20066
+        beta_th  Champ_Fonc_Fonction pb_fluide temperature 1  -2395.84*val+805.478*val^2-120.134*val^3+6.70735*val^4+2667.35
+    }
+
+    Navier_Stokes_Turbulent
+    {
+        /* solveur_pression AMG GCP { atol 1.e-6 impr } */
+        solveur_pression petsc_gpu cli
+        {
+            -pc_type hypre
+                -pc_hypre_type boomeramg
+                -pc_hypre_boomeramg_strong_threshold 0.5
+                -pc_hypre_boomeramg_agg_nl 4
+                -pc_hypre_boomeramg_agg_num_paths 5
+                -pc_hypre_boomeramg_max_levels 25
+                -pc_hypre_boomeramg_coarsen_type PMIS
+                -pc_hypre_boomeramg_interp_type ext+i
+                -pc_hypre_boomeramg_P_max 2
+                -pc_hypre_boomeramg_truncfactor 0.5
+                -ksp_atol 1e-5
+                -ksp_max_it 20000
+            }
+        Parametre_equation parametre_diffusion_implicite
+        {
+            crank 1
+            niter_max_diffusion_implicite  300
+            preconditionnement_diag 1
+            seuil_diffusion_implicite 1.e-9
+        }
+
+        convection { quick }
+        diffusion { }
+        initial_conditions { vitesse champ_uniforme 3 0 0 0 }
+        sources { Boussinesq_temperature { T0 4.2 verif_boussinesq 0 } }
+        boundary_conditions
+        {
+            left paroi_fixe
+            up paroi_fixe
+            bas  paroi_fixe
+            right paroi_fixe
+            perio    periodique
+            outlet frontiere_ouverte_pression_imposee champ_front_uniforme 1  240000
+            wallheat paroi_defilante champ_front_fonc_txyz   3   0.5*(15.1875*((y-0.1)/0.025)^5-35.4375*((y-0.1)/0.025)^4+20.25*((y-0.1)/0.025)^3)*sin(2*Pi*10*t)*(0.1<y)*(y<0.125)*(x<0.001)+0.5*(15.1875*((0.15-y)/0.025)^5-35.4375*((0.15-y)/0.025)^4+20.25*((0.15-y)/0.025)^3)*cos(2*Pi*10*t)*(0.125<y)*(y<0.15)*(x<0.001)     0  0
+            inlet frontiere_ouverte_vitesse_imposee  champ_front_fonction 3 vitesse 0.0001  0.  0
+
+        }
+        modele_turbulence sous_maille_wale  {
+            cw   0.55
+            turbulence_paroi negligeable
+        }
+    }
+
+    Convection_Diffusion_temperature_Turbulent
+    {
+        Modele_Turbulence prandtl { turbulence_paroi negligeable_scalaire dt_impr_nusselt 10 }
+        diffusion { }
+        convection { quick }
+        boundary_conditions   {
+            left         	Paroi_adiabatique
+            up         		Paroi_temperature_imposee champ_front_uniforme 1 4.2
+            perio  		periodique
+            bas        		Paroi_temperature_imposee champ_front_uniforme 1 4.2
+            outlet     		frontiere_ouverte_temperature_imposee champ_front_uniforme 1 4.2
+            inlet    		frontiere_ouverte_temperature_imposee champ_front_uniforme 1 4.2
+            right               Paroi_temperature_imposee champ_front_uniforme 1 4.2
+            wallheat            Paroi_temperature_imposee champ_front_uniforme 1 4.8
+        }
+        initial_conditions { temperature champ_uniforme  1 4.2      }
+    }
+
+    Post_processing
+    {
+        definition_champs
+        {
+            Moy_temp_fluide Reduction_0D { methode moyenne_ponderee sources { refChamp { Pb_champ pb_fluide temperature } } }
+        }
+        Sondes {
+            sonde_vit vitesseY periode 2e-4 point 1 10e-6 0.75 0.015
+            sonde_tem temperature periode 2e-4 point 1 10e-6 0.75  0.015
+            sonde_Temp_Paroi  temperature periode 2e-4 segment 100 10e-6  0. 0.015 10e-6  0.8 0.015
+            sonde_vit2 vitesseY periode 2e-4 point 1 10e-6 0.2 0.015
+            sonde_vit4 vitesseY periode 2e-4 point 1  10e-6 0.17 0.015
+            sonde_vit3 vitesseY periode 2e-4 point 1  10e-6 0.25 0.015
+        }
+        format lml
+        Champs dt_post 1
+        {
+            pression_pa elem
+        }
+        Statistiques dt_post 1
+        {
+            t_deb    0
+            t_fin    200
+	    moyenne vitesse elem
+            moyenne temperature elem
+            correlation vitesse temperature elem
+            correlation temperature temperature elem
+        }
+    }
+}
+
+Solve pb_fluide
+
+postraiter_domaine { domaine dom_fluide fichier file format lata }
+
+End
diff --git a/tests/GPU/PNE_LES_LHe/PNE_LES_LHe_BENCH.TU.is157091_cc86 b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe_BENCH.TU.is157091_cc86
new file mode 100644
index 0000000000..e31fa34cee
--- /dev/null
+++ b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe_BENCH.TU.is157091_cc86
@@ -0,0 +1,79 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PNE_LES_LHe_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     10-06-2026 -- 15:01:40
+OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
+CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
+Total number of threads:64
+GPU model: NVIDIA RTX A6000
+CUDA runtime version: 12.90
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1890304
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                13.8888        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.911821       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               24.4568        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                2.71742        
+Standard deviation between time steps:                                     0.0695912      
+Time elapsed in the skipped time steps:                                    3.13353        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0388775       |  1.4        | 3              
+Matrix assembly for implicit scheme      | 2.324819        | 85.6        | 3              
+Convection operator                      | 0.02628685      |  1.0        | 6              
+Diffusion operator                       | 0.1543779       |  5.7        | 30             
+Gradient operator                        | 0.004737031     |  0.2        | 6              
+Divergence operator                      | 0.00219372      |  0.1        | 4              
+Source terms                             | 0.001756303     |  0.1        | 3              
+Update ::mettre_a_jour                   | 0.02134433      |  0.8        | 1              
+Solver for implicit diffusion            | 0.03286185      |  1.2        | 6              
+Computation of the time step dt          | 0.06742379      |  2.5        | 10             
+Turbulence model::update                 | 0.007030298     |  0.3        | 1              
+Post-treatment operations                | 0.02855687      |  1.1        | 1              
+Other operations                         | 0.007150974     |  0.3        | 
+
+Average number of iteration of the linear solver per call:                 2.52           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0377956       |  1.4        | 3               | 
+Kernels:                                 | 0.186466        |  6.9        | 1783            | 
+Copy host to device:                     | 0.0303324       |  1.1        | 83              | 13.0 GB/s
+Copy device to host:                     | 0.0568247       |  2.1        | 79              | 12.3 GB/s
+Alloc/Free on device:                    | 0.000127717     |  0.0        | 2236            | 
+GPU: 8.3% Copy H<->D: 3.2% Alloc/free: 0.0047% Comm: 0% CPU & I/O: 89%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.179095       
+
+Total time for the whole computation                                       41.6582        
+
diff --git a/tests/GPU/PNE_LES_LHe/PNE_LES_LHe_BENCH.TU.topaze_cc80 b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe_BENCH.TU.topaze_cc80
new file mode 100644
index 0000000000..620438b849
--- /dev/null
+++ b/tests/GPU/PNE_LES_LHe/PNE_LES_LHe_BENCH.TU.topaze_cc80
@@ -0,0 +1,80 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the PNE_LES_LHe_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-06-2026 -- 12:33:04
+OS:       topaze7062__Linux__x86_64__4.18.0-553.123.1.el8_10.x86_64__#1 SMP Mon May 4 13:45:48 EDT 2026
+CPU model : AMD EPYC 7763 64-Core Processor
+Total number of threads:256
+GPU model: NVIDIA A100-SXM4-80GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1890304
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                21.2704        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.52921        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               26.8942        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                2.98825        
+Standard deviation between time steps:                                     0.115967       
+Time elapsed in the skipped time steps:                                    3.76817        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0290735       |  1.0        | 3              
+Matrix assembly for implicit scheme      | 2.659207        | 89.0        | 3              
+Convection operator                      | 0.01160732      |  0.4        | 6              
+Diffusion operator                       | 0.1153224       |  3.9        | 30             
+Gradient operator                        | 0.002387247     |  0.1        | 6              
+Divergence operator                      | 0.001326333     |  0.0        | 4              
+Source terms                             | 0.0009539784    |  0.0        | 3              
+Update ::mettre_a_jour                   | 0.01441476      |  0.5        | 1              
+Solver for implicit diffusion            | 0.01719583      |  0.6        | 6              
+Computation of the time step dt          | 0.08637916      |  2.9        | 10             
+Turbulence model::update                 | 0.003609511     |  0.1        | 1              
+Post-treatment operations                | 0.04176783      |  1.4        | 1              
+Other operations                         | 0.005002396     |  0.2        | 
+
+Average number of iteration of the linear solver per call:                 2.52           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0284574       |  1.0        | 3               | 
+Kernels:                                 | 0.0965999       |  3.2        | 1783            | 
+Copy host to device:                     | 0.0240385       |  0.8        | 83              | 16.3 GB/s
+Copy device to host:                     | 0.0485745       |  1.6        | 79              | 14.4 GB/s
+Alloc/Free on device:                    | 0.000201251     |  0.0        | 2236            | 
+GPU: 4.2% Copy H<->D: 2.4% Alloc/free: 0.0067% Comm: 0% CPU & I/O: 93%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.305996       
+
+Total time for the whole computation                                       52.2388        
+
+[Slurm] Power consumption (97 s):  0.379 kW  0.010 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/PNE_LES_LHe/check_perf.sh b/tests/GPU/PNE_LES_LHe/check_perf.sh
new file mode 120000
index 0000000000..6d20411c12
--- /dev/null
+++ b/tests/GPU/PNE_LES_LHe/check_perf.sh
@@ -0,0 +1 @@
+../DomainFlowLES/check_perf.sh
\ No newline at end of file
diff --git a/tests/GPU/TRUSTSingle/TRUSTSingle.TU.ref_is157091 b/tests/GPU/TRUSTSingle/TRUSTSingle.TU.ref_is157091
new file mode 100644
index 0000000000..44b677f2d0
--- /dev/null
+++ b/tests/GPU/TRUSTSingle/TRUSTSingle.TU.ref_is157091
@@ -0,0 +1,60 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TRUSTSingle_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     09-04-2026 -- 14:28:31
+OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
+CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
+Total number of threads:64
+GPU model: No GPU used for the computation
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 39601
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                0.269967       
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.0943477      
+Average number of iteration of the linear solver per call:                 1              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the time loop:                                               3.16648        
+Number of time steps:                                                      100            
+Skipped time steps:                                                        0              
+Average time per time step:                                                0.0316648      
+Standard deviation between time steps:                                     0.000411583    
+Time elapsed in the skipped time steps:                                    0              
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.014383        | 45.4        | 1              
+Convection operator                      | 0.002030326     |  6.4        | 10             
+Diffusion operator                       | 0.006760434     | 21.4        | 10             
+Gradient operator                        | 0.000844102     |  2.7        | 2              
+Divergence operator                      | 0.0005150209    |  1.6        | 2              
+Update ::mettre_a_jour                   | 0.001913582     |  6.0        | 1              
+Computation of the time step dt          | 0.002560449     |  8.1        | 20             
+Post-treatment operations                | 0.0002793886    |  0.9        | 1              
+Other operations                         | 0.002378514     |  7.5        | 
+
+Average number of iteration of the linear solver per call:                 1              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.000175592    
+
+Total time for the whole computation                                       3.43674        
+
diff --git a/tests/GPU/TRUSTSingle/TRUSTSingle.data b/tests/GPU/TRUSTSingle/TRUSTSingle.data
new file mode 100644
index 0000000000..cd8b137659
--- /dev/null
+++ b/tests/GPU/TRUSTSingle/TRUSTSingle.data
@@ -0,0 +1,122 @@
+# Performance of TRUSTSingle #
+dimension 2
+
+Pb_Thermohydraulique_Cloned_Concentration pb
+Domaine dom
+
+# BEGIN MESH #
+Mailler dom
+{
+    Pave Cavite
+    {
+        Origine 0. 0.
+        Nombre_de_Noeuds 200 200
+        Longueurs 0.01 0.01
+    }
+    {
+        Bord Gauche X = 0.	0. <= Y <= 0.01
+        Bord Haut   Y = 0.01	0. <= X <= 0.01
+        Bord Bas    Y = 0.	0. <= X <= 0.01
+        Bord Droit  X = 0.01	0. <= Y <= 0.01
+    }
+}
+
+# END MESH #
+# BEGIN PARTITION
+Partition dom
+{
+    Partition_tool tranche { tranches 2 1 }
+    Larg_joint 2
+    zones_name DOM
+}
+End
+END PARTITION #
+
+# BEGIN SCATTER
+Scatter DOM.Zones dom
+END SCATTER #
+
+vdf dis
+
+Schema_euler_explicite sch
+Read sch
+{
+    nb_pas_dt_max 100
+    tinit 0
+    dt_max 1e-5
+    dt_impr 1e-10
+    dt_sauv -1
+}
+
+Associate pb dom
+Associate pb sch
+Discretize pb dis
+
+Read pb
+{
+    Fluide_Incompressible
+    {
+        mu	champ_fonc_fonction pb temperature 1 1.85e-5*(1+val*0.01)
+        rho	Champ_Uniforme 1 1.
+        lambda	champ_fonc_fonction pb temperature 1 0.0262
+        Cp	Champ_Uniforme 1 1.
+        beta_th	Champ_Uniforme 1 3.41e-3
+        beta_co	Champ_Uniforme 1 0.04
+        gravite Champ_Uniforme 2 0 -9.81
+    }
+
+    Constituant
+    {
+        coefficient_diffusion champ_uniforme 8 0.000262 0.00262 0.0262 0.262 0.000262 0.00262 0.0262 0.262
+    }
+
+    Navier_Stokes_standard
+    {
+        solveur_pression petsc cholesky { }
+
+        convection { amont }
+        diffusion { }
+        initial_conditions
+        {
+            vitesse Champ_Uniforme 2 0. 0.
+        }
+        boundary_conditions
+        {
+            Haut	symetrie
+            Droit	frontiere_ouverte_vitesse_imposee Champ_Front_Uniforme 2  0. 0.
+            Bas	    symetrie
+            Gauche	frontiere_ouverte_vitesse_imposee Champ_Front_Uniforme 2  0. 0.
+        }
+
+    }
+    Convection_Diffusion_Temperature
+    {
+        diffusion { }
+        convection { amont }
+        initial_conditions { Temperature Champ_Uniforme 1 0. }
+        boundary_conditions
+        {
+            Haut symetrie
+            Bas     symetrie
+            Gauche	frontiere_ouverte_temperature_imposee Champ_Front_Uniforme 1 0.
+            Droit	frontiere_ouverte_temperature_imposee Champ_Front_Uniforme 1 0.
+        }
+    }
+    Convection_diffusion_Concentration
+    {
+        diffusion { }
+        convection { amont }
+        initial_conditions { concentration0 Champ_Uniforme 1 0. }
+        boundary_conditions
+        {
+            Haut    symetrie
+            Bas     symetrie
+            Gauche	frontiere_ouverte_concentration_imposee Champ_Front_Uniforme 1 0.
+            Droit	frontiere_ouverte_concentration_imposee Champ_Front_Uniforme 1 10.
+        }
+    }
+}
+
+Solve pb
+End
+
diff --git a/tests/GPU/TRUSTSingle/TRUSTSingle.lml.gz b/tests/GPU/TRUSTSingle/TRUSTSingle.lml.gz
new file mode 100644
index 0000000000..df58e35069
Binary files /dev/null and b/tests/GPU/TRUSTSingle/TRUSTSingle.lml.gz differ
diff --git a/tests/GPU/TaylorGreen/TaylorGreen.data b/tests/GPU/TaylorGreen/TaylorGreen.data
new file mode 100644
index 0000000000..dba39d9940
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen.data
@@ -0,0 +1,114 @@
+# Taylor-Green vortex DNS - Wang 2013, section 4.15             #
+# 3D incompressible Navier-Stokes, Re=1600                       #
+# Domain: -pi*L <= x,y,z <= pi*L with L=1                       #
+# V0=1, tc=L/V0=1, t_final=20*tc=20                             #
+# Re = rho0*V0*L/mu = 1600  =>  mu = 1/1600 = 6.25e-4           #
+# Initial conditions (Taylor-Green vortex):                       #
+#   u =  V0*sin(x/L)*cos(y/L)*cos(z/L) =  sin(x)*cos(y)*cos(z) #
+#   v = -V0*cos(x/L)*sin(y/L)*cos(z/L) = -cos(x)*sin(y)*cos(z) #
+#   w = 0                                                         #
+Dimension 3
+Pb_Hydraulique pb
+Domaine dom
+
+# BEGIN MESH #
+Mailler dom
+{
+    Pave Cube
+    {
+        Origine -3.14159265358979 -3.14159265358979 -3.14159265358979
+        Nombre_de_Noeuds 6 6 6
+        /* Nombre_de_Noeuds 101 101 101 */
+        Longueurs 6.28318530717959 6.28318530717959 6.28318530717959
+    }
+    {
+        Bord periox   X = -3.14159265358979  -3.14159265358979 <= Y <= 3.14159265358979  -3.14159265358979 <= Z <= 3.14159265358979
+        Bord periox   X =  3.14159265358979  -3.14159265358979 <= Y <= 3.14159265358979  -3.14159265358979 <= Z <= 3.14159265358979
+        Bord perioy   Y = -3.14159265358979  -3.14159265358979 <= X <= 3.14159265358979  -3.14159265358979 <= Z <= 3.14159265358979
+        Bord perioy   Y =  3.14159265358979  -3.14159265358979 <= X <= 3.14159265358979  -3.14159265358979 <= Z <= 3.14159265358979
+        Bord perioz   Z = -3.14159265358979  -3.14159265358979 <= X <= 3.14159265358979  -3.14159265358979 <= Y <= 3.14159265358979
+        Bord perioz   Z =  3.14159265358979  -3.14159265358979 <= X <= 3.14159265358979  -3.14159265358979 <= Y <= 3.14159265358979
+    }
+}
+Declarer_bord_perio { domaine dom bord periox }
+Declarer_bord_perio { domaine dom bord perioy }
+Declarer_bord_perio { domaine dom bord perioz }
+# END MESH #
+
+# BEGIN PARTITION
+Partition dom
+{
+    Partition_tool Metis { Nb_parts 4 }
+    Larg_joint 2
+    single_hdf
+    zones_name DOM
+}
+End
+END PARTITION #
+
+# BEGIN SCATTER
+Scatter DOM.Zones dom
+END SCATTER #
+
+VDF dis
+Lire dis { reorder { algo hilbert } }
+
+Runge_Kutta_ordre_3 sch
+Read sch
+{
+    nb_pas_dt_max 50
+    tinit 0
+    tmax 20
+    dt_impr 1.0
+    facsec 1.
+}
+
+Associate pb dom
+Associate pb sch
+Discretize pb dis
+
+#
+Domaine plan_x0
+Domaine plan_y0
+Domaine plan_z0
+Extraire_surface { domaine plan_x0 probleme pb condition_elements -0.032<x<0.032 }
+Extraire_surface { domaine plan_y0 probleme pb condition_elements -0.032<y<0.032 }
+Extraire_surface { domaine plan_z0 probleme pb condition_elements -0.032<z<0.032 }
+Domaine lateral
+Extraire_surface
+{
+    domaine lateral
+    probleme pb
+    avec_certains_bords 3 periox perioy perioz
+}
+#
+Lire pb
+{
+    Fluide_incompressible
+    {
+        mu  champ_uniforme 1 6.25e-4
+        rho champ_uniforme 1 1
+    }
+    Navier_Stokes_standard
+    {
+        Solveur_pression    AMG GCP { rtol 1e-4 atol 1.e-15 impr }
+        Convection          { quick }
+        Diffusion           { }
+        Conditions_initiales {
+            vitesse champ_fonc_xyz dom 3
+                sin(x)*cos(y)*cos(z)
+                -cos(x)*sin(y)*cos(z)
+                0.
+            pression champ_fonc_xyz dom 1 (cos(2*x)+cos(2*y))*(cos(2*z)+2)/16
+        }
+        Conditions_limites  {
+            periox   periodique
+            perioy   periodique
+            perioz   periodique
+        }
+        Traitement_particulier { EC { Ec periode 0.1 } } # Ec computed accurately, no velocity interpolation on element #
+    }
+    Postraitement { format lml Champs dt_post 1e6 { vorticite elem } }
+}
+Solve pb
+End
diff --git a/tests/GPU/TaylorGreen/TaylorGreen.lml.gz b/tests/GPU/TaylorGreen/TaylorGreen.lml.gz
new file mode 100644
index 0000000000..b4c24e7f2d
Binary files /dev/null and b/tests/GPU/TaylorGreen/TaylorGreen.lml.gz differ
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.adastra_gfx90a b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.adastra_gfx90a
new file mode 100644
index 0000000000..b1af46b93f
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.adastra_gfx90a
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     23-04-2026 -- 09:36:49
+OS:       g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                12.9292        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.648536       
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               30.5851        
+Number of time steps:                                                      388            
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0788276      
+Standard deviation between time steps:                                     0.0957122      
+Time elapsed in the skipped time steps:                                    0.302043       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0422761       | 53.6        | 3              
+Convection operator                      | 0.0100035       | 12.7        | 3              
+Diffusion operator                       | 0.005446688     |  6.9        | 3              
+Gradient operator                        | 0.001918766     |  2.4        | 6              
+Divergence operator                      | 0.002071648     |  2.6        | 4              
+Update ::mettre_a_jour                   | 0.001549268     |  2.0        | 1              
+Computation of the time step dt          | 0.0004215803    |  0.5        | 2              
+Post-treatment operations                | 0.01133839      | 14.4        | 1              
+Other operations                         | 0.003801692     |  4.8        | 
+
+Average number of iteration of the linear solver per call:                 13.5           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0419368       | 53.2        | 3               | 
+Kernels:                                 | 0.0312796       | 39.7        | 303             | 
+Copy host to device:                     | 0.000231061     |  0.3        | 13              | 0.7 GB/s
+Copy device to host:                     | 1.64936e-05     |  0.0        | 0               | 13.3 GB/s
+Alloc/Free on device:                    | 0.000498703     |  0.6        | 310             | 
+GPU: 93% Copy H<->D: 0.31% Alloc/free: 0.63% Comm: 0% CPU & I/O: 6.2%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.037555       
+
+Total time for the whole computation                                       43.8542        
+
+[Slurm] Power consumption (51 s):  0.536 kW  0.008 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.adastra_gfx942 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.adastra_gfx942
new file mode 100644
index 0000000000..e1b77192d9
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.adastra_gfx942
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     23-04-2026 -- 14:58:43
+OS:       a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD Instinct MI300A Accelerator
+Total number of threads:192
+GPU model: AMD Instinct MI300A
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                11.2893        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.694486       
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               16.38          
+Number of time steps:                                                      388            
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0422165      
+Standard deviation between time steps:                                     0.00388779     
+Time elapsed in the skipped time steps:                                    0.160434       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0230728       | 54.7        | 3              
+Convection operator                      | 0.003841562     |  9.1        | 3              
+Diffusion operator                       | 0.00227114      |  5.4        | 3              
+Gradient operator                        | 0.001197736     |  2.8        | 6              
+Divergence operator                      | 0.001136415     |  2.7        | 4              
+Update ::mettre_a_jour                   | 0.001148386     |  2.7        | 1              
+Computation of the time step dt          | 0.0002428235    |  0.6        | 2              
+Post-treatment operations                | 0.006814134     | 16.1        | 1              
+Other operations                         | 0.002491502     |  5.9        | 
+
+Average number of iteration of the linear solver per call:                 13.5           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0228135       | 54.0        | 3               | 
+Kernels:                                 | 0.0121658       | 28.8        | 318             | 
+Copy host to device:                     | 0.000199058     |  0.5        | 13              | 0.8 GB/s
+Copy device to host:                     | 0.000438415     |  1.0        | 1               | 49.8 GB/s
+Alloc/Free on device:                    | 0.000203403     |  0.5        | 311             | 
+GPU: 83% Copy H<->D: 1.5% Alloc/free: 0.48% Comm: 0% CPU & I/O: 15%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0383888      
+
+Total time for the whole computation                                       27.8684        
+
+[Slurm] Power consumption (38 s):  0.784 kW  0.008 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.dalianvl_cc100 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..7da4e326b3
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     09-06-2026 -- 09:05:44
+OS:       dalianvl06__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.74914        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.471625       
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.83892        
+Number of time steps:                                                      49             
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.037529       
+Standard deviation between time steps:                                     0.00528329     
+Time elapsed in the skipped time steps:                                    0.0876919      
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0280251       | 74.7        | 3              
+Convection operator                      | 0.001938012     |  5.2        | 3              
+Diffusion operator                       | 0.001504543     |  4.0        | 3              
+Gradient operator                        | 0.0009621812    |  2.6        | 6              
+Divergence operator                      | 0.001100594     |  2.9        | 4              
+Update ::mettre_a_jour                   | 0.0007023104    |  1.9        | 1              
+Computation of the time step dt          | 0.000180949     |  0.5        | 2              
+Post-treatment operations                | 0.0003962359    |  1.1        | 1              
+Other operations                         | 0.00271915      |  7.2        | 
+
+Average number of iteration of the linear solver per call:                 14             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.026644        | 71.0        | 3               | 
+Kernels:                                 | 0.00819043      | 21.8        | 304             | 
+Copy host to device:                     | 0.000177673     |  0.5        | 13              | 0.0 GB/s
+Copy device to host:                     | 0.00025907      |  0.7        | 12              | 1.8 GB/s
+Alloc/Free on device:                    | 1.19745e-05     |  0.0        | 286             | 
+GPU: 93% Copy H<->D: 1.2% Alloc/free: 0.032% Comm: 0% CPU & I/O: 6%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.033557       
+
+Total time for the whole computation                                       8.70936        
+
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.eureka_cc89 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..462a4c6d56
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.eureka_cc89
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:36:48
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                5.38887        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.62054        
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               25.6012        
+Number of time steps:                                                      388            
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0659824      
+Standard deviation between time steps:                                     0.0060902      
+Time elapsed in the skipped time steps:                                    0.244451       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0404207       | 61.3        | 3              
+Convection operator                      | 0.006148762     |  9.3        | 3              
+Diffusion operator                       | 0.003561739     |  5.4        | 3              
+Gradient operator                        | 0.001136671     |  1.7        | 6              
+Divergence operator                      | 0.006674934     | 10.1        | 4              
+Update ::mettre_a_jour                   | 0.001842883     |  2.8        | 1              
+Computation of the time step dt          | 0.0003300212    |  0.5        | 2              
+Post-treatment operations                | 0.004873083     |  7.4        | 1              
+Other operations                         | 0.0009936003    |  1.5        | 
+
+Average number of iteration of the linear solver per call:                 13.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0402207       | 61.0        | 3               | 
+Kernels:                                 | 0.0160424       | 24.3        | 315             | 
+Copy host to device:                     | 0.00310556      |  4.7        | 17              | 9.6 GB/s
+Copy device to host:                     | 0.00392875      |  6.0        | 5               | 13.1 GB/s
+Alloc/Free on device:                    | 0.0005011       |  0.8        | 311             | 
+GPU: 85% Copy H<->D: 11% Alloc/free: 0.76% Comm: 0% CPU & I/O: 3.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.014785       
+
+Total time for the whole computation                                       31.2497        
+
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.irene-amd-ccrt_cc70
new file mode 100644
index 0000000000..e22a4dc95e
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.irene-amd-ccrt_cc70
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     23-04-2026 -- 09:45:25
+OS:       irene7050__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
+Total number of threads:80
+GPU model: Tesla V100-SXM2-16GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                10.2371        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.90031        
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               35.8402        
+Number of time steps:                                                      388            
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0923716      
+Standard deviation between time steps:                                     0.0642075      
+Time elapsed in the skipped time steps:                                    0.434717       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0624471       | 67.6        | 3              
+Convection operator                      | 0.00941783      | 10.2        | 3              
+Diffusion operator                       | 0.004559281     |  4.9        | 3              
+Gradient operator                        | 0.00250871      |  2.7        | 6              
+Divergence operator                      | 0.002154565     |  2.3        | 4              
+Update ::mettre_a_jour                   | 0.002488689     |  2.7        | 1              
+Computation of the time step dt          | 0.0004637799    |  0.5        | 2              
+Post-treatment operations                | 0.003477803     |  3.8        | 1              
+Other operations                         | 0.00485381      |  5.3        | 
+
+Average number of iteration of the linear solver per call:                 13.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0620342       | 67.2        | 3               | 
+Kernels:                                 | 0.0232568       | 25.2        | 303             | 
+Copy host to device:                     | 0.000207016     |  0.2        | 13              | 0.8 GB/s
+Copy device to host:                     | 6.68813e-05     |  0.1        | 0               | 3.3 GB/s
+Alloc/Free on device:                    | 0.000659453     |  0.7        | 310             | 
+GPU: 92% Copy H<->D: 0.3% Alloc/free: 0.71% Comm: 0% CPU & I/O: 6.7%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0348589      
+
+Total time for the whole computation                                       46.5472        
+
+[Slurm] Power consumption (59 s):  0.183 kW  0.003 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is157091_cc86 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is157091_cc86
new file mode 100644
index 0000000000..e6259a042c
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is157091_cc86
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     03-06-2026 -- 13:01:43
+OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
+CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
+Total number of threads:64
+GPU model: NVIDIA RTX A6000
+CUDA runtime version: 12.90
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.97009        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.637857       
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               4.59486        
+Number of time steps:                                                      49             
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0937727      
+Standard deviation between time steps:                                     0.00213292     
+Time elapsed in the skipped time steps:                                    0.2318         
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0688406       | 73.4        | 3              
+Convection operator                      | 0.00943249      | 10.1        | 3              
+Diffusion operator                       | 0.004282985     |  4.6        | 3              
+Gradient operator                        | 0.002661855     |  2.8        | 6              
+Divergence operator                      | 0.001830782     |  2.0        | 4              
+Update ::mettre_a_jour                   | 0.0006778731    |  0.7        | 1              
+Computation of the time step dt          | 0.000528149     |  0.6        | 2              
+Post-treatment operations                | 0.0002804149    |  0.3        | 1              
+Other operations                         | 0.005237539     |  5.6        | 
+
+Average number of iteration of the linear solver per call:                 14             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0682666       | 72.8        | 3               | 
+Kernels:                                 | 0.0237455       | 25.3        | 304             | 
+Copy host to device:                     | 0.000107392     |  0.1        | 13              | 0.0 GB/s
+Copy device to host:                     | 4.20598e-05     |  0.0        | 0               | 10.8 GB/s
+Alloc/Free on device:                    | 0.000506832     |  0.5        | 310             | 
+GPU: 98% Copy H<->D: 0.16% Alloc/free: 0.54% Comm: 0% CPU & I/O: 1.2%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0153142      
+
+Total time for the whole computation                                       11.8121        
+
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is159479_cc120 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..dde5a18cbd
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is159479_cc120
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-05-2026 -- 07:06:42
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May  1 12:45:19 UTC 2026 (6
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                3.16518        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.354792       
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               16.213         
+Number of time steps:                                                      388            
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0417862      
+Standard deviation between time steps:                                     0.00220827     
+Time elapsed in the skipped time steps:                                    0.153858       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0305818       | 73.2        | 3              
+Convection operator                      | 0.004429496     | 10.6        | 3              
+Diffusion operator                       | 0.001778696     |  4.3        | 3              
+Gradient operator                        | 0.0007466083    |  1.8        | 6              
+Divergence operator                      | 0.0008054549    |  1.9        | 4              
+Update ::mettre_a_jour                   | 0.0003303047    |  0.8        | 1              
+Computation of the time step dt          | 0.0001911036    |  0.5        | 2              
+Post-treatment operations                | 0.001386709     |  3.3        | 1              
+Other operations                         | 0.001536038     |  3.7        | 
+
+Average number of iteration of the linear solver per call:                 13.6           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0302758       | 72.5        | 3               | 
+Kernels:                                 | 0.00976492      | 23.4        | 346             | 
+Copy host to device:                     | 7.26197e-05     |  0.2        | 13              | 0.0 GB/s
+Copy device to host:                     | 0.000337603     |  0.8        | 4               | 5.9 GB/s
+Alloc/Free on device:                    | 0.000283992     |  0.7        | 311             | 
+GPU: 96% Copy H<->D: 0.98% Alloc/free: 0.68% Comm: 0% CPU & I/O: 2.5%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.010984       
+
+Total time for the whole computation                                       19.5433        
+
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is247793_gfx1100 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is247793_gfx1100
new file mode 100644
index 0000000000..dc699fcd7d
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.is247793_gfx1100
@@ -0,0 +1,75 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 19:06:05
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.74851        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.962062       
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               40.4216        
+Number of time steps:                                                      388            
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.104179       
+Standard deviation between time steps:                                     0.0656626      
+Time elapsed in the skipped time steps:                                    0.433358       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0663882       | 63.7        | 3              
+Convection operator                      | 0.009758096     |  9.4        | 3              
+Diffusion operator                       | 0.006048223     |  5.8        | 3              
+Gradient operator                        | 0.002385386     |  2.3        | 6              
+Divergence operator                      | 0.002598763     |  2.5        | 4              
+Update ::mettre_a_jour                   | 0.001005287     |  1.0        | 1              
+Computation of the time step dt          | 0.000558219     |  0.5        | 2              
+Post-treatment operations                | 0.01075544      | 10.3        | 1              
+Other operations                         | 0.00468169      |  4.5        | 
+
+Average number of iteration of the linear solver per call:                 13.5           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.065981        | 63.3        | 3               | 
+Kernels:                                 | 0.0356524       | 34.2        | 346             | 
+Copy host to device:                     | 0.000291844     |  0.3        | 13              | 0.0 GB/s
+Copy device to host:                     | 0.000159518     |  0.2        | 4               | 12.5 GB/s
+Alloc/Free on device:                    | 0.000699643     |  0.7        | 311             | 
+GPU: 98% Copy H<->D: 0.43% Alloc/free: 0.67% Comm: 0% CPU & I/O: 1.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0170791      
+
+Total time for the whole computation                                       47.6207        
+
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.jean-zay_cc90 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.jean-zay_cc90
new file mode 100644
index 0000000000..d0336672bc
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.jean-zay_cc90
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     10-06-2026 -- 10:46:31
+OS:       jzxh021__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                9.01156        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.588518       
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.0473         
+Number of time steps:                                                      49             
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0417815      
+Standard deviation between time steps:                                     0.00209294     
+Time elapsed in the skipped time steps:                                    0.265539       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0306315       | 73.3        | 3              
+Convection operator                      | 0.002527348     |  6.0        | 3              
+Diffusion operator                       | 0.001950996     |  4.7        | 3              
+Gradient operator                        | 0.001071137     |  2.6        | 6              
+Divergence operator                      | 0.001146943     |  2.7        | 4              
+Update ::mettre_a_jour                   | 0.0008361189    |  2.0        | 1              
+Computation of the time step dt          | 0.0001930345    |  0.5        | 2              
+Post-treatment operations                | 0.0003362419    |  0.8        | 1              
+Other operations                         | 0.003088216     |  7.4        | 
+
+Average number of iteration of the linear solver per call:                 14             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0301188       | 72.1        | 3               | 
+Kernels:                                 | 0.00873178      | 20.9        | 304             | 
+Copy host to device:                     | 0.000165068     |  0.4        | 13              | 0.0 GB/s
+Copy device to host:                     | 0.000310021     |  0.7        | 12              | 1.5 GB/s
+Alloc/Free on device:                    | 7.35443e-06     |  0.0        | 286             | 
+GPU: 93% Copy H<->D: 1.1% Alloc/free: 0.018% Comm: 0% CPU & I/O: 5.9%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.022536       
+
+Total time for the whole computation                                       11.347         
+
+[Slurm] Power consumption (19 s):  0.422 kW  0.002 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.lumi_gfx90a b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.lumi_gfx90a
new file mode 100644
index 0000000000..2c3595c17a
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.lumi_gfx90a
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     05-06-2026 -- 22:31:42
+OS:       nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                72.1727        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             2.25663        
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               4.57714        
+Number of time steps:                                                      49             
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0934111      
+Standard deviation between time steps:                                     0.117934       
+Time elapsed in the skipped time steps:                                    0.636137       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0451235       | 48.3        | 3              
+Convection operator                      | 0.01070777      | 11.5        | 3              
+Diffusion operator                       | 0.005400141     |  5.8        | 3              
+Gradient operator                        | 0.001926701     |  2.1        | 6              
+Divergence operator                      | 0.001946797     |  2.1        | 4              
+Update ::mettre_a_jour                   | 0.001422196     |  1.5        | 1              
+Computation of the time step dt          | 0.0003921027    |  0.4        | 2              
+Post-treatment operations                | 0.02267105      | 24.3        | 1              
+Other operations                         | 0.003820766     |  4.1        | 
+
+Average number of iteration of the linear solver per call:                 14             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0447554       | 47.9        | 3               | 
+Kernels:                                 | 0.0460029       | 49.2        | 304             | 
+Copy host to device:                     | 0.000225169     |  0.2        | 13              | 0.0 GB/s
+Copy device to host:                     | 2.55554e-05     |  0.0        | 0               | 17.9 GB/s
+Alloc/Free on device:                    | 0.000507781     |  0.5        | 310             | 
+GPU: 97% Copy H<->D: 0.27% Alloc/free: 0.54% Comm: 0% CPU & I/O: 2%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0320638      
+
+Total time for the whole computation                                       77.4181        
+
+[Slurm] Power consumption (96 s):  0.507 kW  0.014 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.topaze_cc80 b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.topaze_cc80
new file mode 100644
index 0000000000..381f0b17a4
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_BENCH.TU.topaze_cc80
@@ -0,0 +1,76 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the TaylorGreen_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-06-2026 -- 12:34:42
+OS:       topaze7062__Linux__x86_64__4.18.0-553.123.1.el8_10.x86_64__#1 SMP Mon May 4 13:45:48 EDT 2026
+CPU model : AMD EPYC 7763 64-Core Processor
+Total number of threads:256
+GPU model: NVIDIA A100-SXM4-80GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                10.4623        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.03733        
+Average number of iteration of the linear solver per call:                 11             
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.76033        
+Number of time steps:                                                      49             
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0563332      
+Standard deviation between time steps:                                     0.00395212     
+Time elapsed in the skipped time steps:                                    0.283041       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0419811       | 74.5        | 3              
+Convection operator                      | 0.00381971      |  6.8        | 3              
+Diffusion operator                       | 0.002497447     |  4.4        | 3              
+Gradient operator                        | 0.001444911     |  2.6        | 6              
+Divergence operator                      | 0.001232232     |  2.2        | 4              
+Update ::mettre_a_jour                   | 0.001462763     |  2.6        | 1              
+Computation of the time step dt          | 0.000249541     |  0.4        | 2              
+Post-treatment operations                | 0.0005570403    |  1.0        | 1              
+Other operations                         | 0.003088481     |  5.5        | 
+
+Average number of iteration of the linear solver per call:                 14             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0413206       | 73.4        | 3               | 
+Kernels:                                 | 0.0121898       | 21.6        | 304             | 
+Copy host to device:                     | 0.000149844     |  0.3        | 13              | 0.0 GB/s
+Copy device to host:                     | 0.000362689     |  0.6        | 12              | 1.3 GB/s
+Alloc/Free on device:                    | 9.97549e-06     |  0.0        | 286             | 
+GPU: 95% Copy H<->D: 0.91% Alloc/free: 0.018% Comm: 0% CPU & I/O: 4.1%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0336353      
+
+Total time for the whole computation                                       13.5394        
+
+[Slurm] Power consumption (57 s):  0.407 kW  0.006 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/TaylorGreen/TaylorGreen_EC.son_ref b/tests/GPU/TaylorGreen/TaylorGreen_EC.son_ref
new file mode 100644
index 0000000000..addc776b60
--- /dev/null
+++ b/tests/GPU/TaylorGreen/TaylorGreen_EC.son_ref
@@ -0,0 +1,14 @@
+# Temps        Energie_cinetique_totale
+0.00000000e+00 3.10062767e+01
+1.06625268e+00 2.47998448e+01
+2.07434992e+00 2.06924939e+01
+3.13906674e+00 1.73021176e+01
+4.31395623e+00 1.42913666e+01
+5.63905483e+00 1.16240944e+01
+7.15181113e+00 9.31943515e+00
+8.89094816e+00 7.32615578e+00
+1.08957949e+01 5.64517338e+00
+1.32159065e+01 4.28041341e+00
+1.59167146e+01 3.21474828e+00
+1.90915042e+01 2.40377983e+00
+2.28934723e+01 1.79118993e+00
diff --git a/tests/GPU/TaylorGreen/check_perf.sh b/tests/GPU/TaylorGreen/check_perf.sh
new file mode 120000
index 0000000000..6d20411c12
--- /dev/null
+++ b/tests/GPU/TaylorGreen/check_perf.sh
@@ -0,0 +1 @@
+../DomainFlowLES/check_perf.sh
\ No newline at end of file
diff --git a/tests/GPU/TaylorGreen/scaling.sh b/tests/GPU/TaylorGreen/scaling.sh
new file mode 120000
index 0000000000..e60cf18f07
--- /dev/null
+++ b/tests/GPU/TaylorGreen/scaling.sh
@@ -0,0 +1 @@
+../OpenMP_Iterateur/scaling.sh
\ No newline at end of file
diff --git a/tests/GPU/TaylorGreen/verifie b/tests/GPU/TaylorGreen/verifie
new file mode 100755
index 0000000000..4c1ba1ff1f
--- /dev/null
+++ b/tests/GPU/TaylorGreen/verifie
@@ -0,0 +1 @@
+compare_sonde TaylorGreen_EC.son_ref TaylorGreen_EC.son 1>verifie.log 2>&1 || exit 1 # exit status must lie in 0-255; non-zero reports that compare_sonde failed
diff --git a/tests/GPU/VDF_90M_180GB/VDF_90M_180GB.data b/tests/GPU/VDF_90M_180GB/VDF_90M_180GB.data
new file mode 100644
index 0000000000..3da1278cbc
--- /dev/null
+++ b/tests/GPU/VDF_90M_180GB/VDF_90M_180GB.data
@@ -0,0 +1,114 @@
+# Test memory on device #
+# PARALLEL OK #
+Dimension 3
+
+Pb_hydraulique_turbulent pb
+
+Domaine dom_perio
+
+# BEGIN MESH #
+Mailler dom_perio
+{
+    Pave pave
+    {
+        /* warning dumb geometry */
+        Origine -30 0. 0.
+        Nombre_de_Noeuds 1001 301 301
+        Longueurs 30 2 10
+    }
+    {
+        Bord Periox      X = -30   0. <= Y <= 2. 0. <= Z <= 10.
+        Bord Periox      X = 0     0. <= Y <= 2. 0. <= Z <= 10.
+        Bord LowerWall   Y = 0.  -30. <= X <= 0. 0. <= Z <= 10.
+        Bord UpperWall   Y = 2.  -30. <= X <= 0. 0. <= Z <= 10.
+        Bord Perioz      Z = 0.  -30. <= X <= 0. 0. <= Y <= 2.
+        Bord Perioz      Z = 10. -30. <= X <= 0. 0. <= Y <= 2.
+    }
+}
+Declarer_bord_perio { domaine dom_perio bord Periox }
+Declarer_bord_perio { domaine dom_perio bord Perioz }
+# END MESH #
+
+# BEGIN PARTITION
+Partition dom_perio
+{
+    Partition_tool Metis { Nb_parts 4 }
+    Larg_joint 2
+    zones_name DOM
+}
+End
+END PARTITION #
+
+# BEGIN SCATTER
+Scatter DOM.Zones dom_perio
+END SCATTER #
+
+VDF dis 
+Lire dis { reorder { algo hilbert } }
+
+Runge_Kutta_ordre_3 sch
+Lire sch
+{
+	nb_pas_dt_max 0
+        dt_sauv -1
+	tinit 0
+	dt_impr 1e-6
+	facsec 2
+	precision_impr 8
+	tcpumax 23
+}
+
+Associer pb dom_perio
+Associer pb sch
+
+Discretiser pb dis
+
+Lire pb
+{
+	Fluide_incompressible
+	{
+		mu	champ_uniforme 1 3.5e-04
+		rho	champ_uniforme 1 1
+	}
+	Navier_Stokes_turbulent
+	{
+                Solveur_pression	AMG GCP { rtol 1e-15 impr }
+#
+		solveur_pression	petsc cli
+						{
+						-ksp_view
+						-ksp_type gmres
+						-ksp_norm_type unpreconditioned
+						-pc_type hypre
+						-pc_hypre_type boomeramg
+						-pc_mg_galerkin_mat_product_algorithm hypre
+						-pc_hypre_boomeramg_relax_type_all l1scaled-Jacobi
+						-pc_hypre_boomeramg_coarsen_type pmis
+						-pc_hypre_boomeramg_interp_type ext+i
+						-pc_hypre_boomeramg_strong_threshold 0.30
+						-pc_hypre_boomeramg_print_statistics 1
+						-ksp_rtol 1e-15 impr
+						}
+#
+		conditions_initiales	{
+					vitesse champ_uniforme 3 1 0 0
+					pression champ_uniforme 1 0
+					}
+		conditions_limites	{
+					Periox		periodique
+					Perioz		periodique
+					LowerWall	paroi_fixe
+					UpperWall	paroi_fixe
+					}
+		convection		{ centre4 }
+		diffusion		{ }
+		sources			{ canal_perio { bord Periox } }
+		modele_turbulence	null { }
+	}
+}
+
+EcritureLectureSpecial 0
+
+Resoudre pb
+
+Fin
diff --git a/tests/GPU/VEF_75M_190GB/VEF_75M_190GB.data b/tests/GPU/VEF_75M_190GB/VEF_75M_190GB.data
new file mode 100644
index 0000000000..39f491bb10
--- /dev/null
+++ b/tests/GPU/VEF_75M_190GB/VEF_75M_190GB.data
@@ -0,0 +1,121 @@
+# RAM on device #
+# copyToDevice 0 #
+dimension 3
+Pb_thermohydraulique_Turbulent pb1
+Domaine dom
+# BEGIN MESH #
+Mailler dom
+{
+    Pave Entree
+    {
+        Origine 0. 0. 0.
+        Nombre_de_Noeuds 58 58 571
+        Longueurs 1 1 10
+    }
+    {
+        Bord walls X = 0.  0. <= Y <= 1. 0. <= Z <= 10.
+        Bord walls X = 1.  0. <= Y <= 1. 0. <= Z <= 10.
+        Bord walls Y = 0.  0. <= X <= 1. 0. <= Z <= 10.
+        Bord walls Y = 1.  0. <= X <= 1. 0. <= Z <= 10.
+        Bord inlet Z = 0.  0. <= X <= 1. 0. <= Y <= 1.
+        Bord outlet Z = 10.  0. <= X <= 1. 0. <= Y <= 1.
+    }
+}
+Tetraedriser_homogene dom
+VerifierCoin dom {  }
+# END MESH #
+# BEGIN PARTITION
+Partition dom
+{
+    Partition_tool Metis { Nb_parts 8 }
+    Larg_joint 2
+    zones_name dom
+}
+End
+END PARTITION #
+
+# BEGIN SCATTER
+Scatter dom.Zones dom
+END SCATTER #
+
+vef dis 
+Lire dis { reorder { algo hilbert } }
+
+Runge_Kutta_Rationnel_ordre_2 sch
+Lire sch
+{
+    nb_pas_dt_max 0
+    dt_sauv -1
+    tinit 0.
+    tcpumax 47.00
+    dt_impr 0.0000001
+    dt_start dt_calc
+    tmax 2.
+    dt_min 1.e-10
+    dt_max 6.e-3
+    seuil_statio 1.e-14
+    facsec 1
+    diffusion_implicite 1
+    seuil_diffusion_implicite 1.e-10
+}
+
+
+Associer pb1 dom
+Associer pb1 sch
+Discretiser pb1 dis
+
+Lire pb1
+{
+
+    fluide_incompressible
+    {
+        gravite champ_uniforme 3 0 0 -9.81
+        mu Champ_Uniforme 1 0.008
+        rho Champ_Uniforme 1 995.2
+        lambda Champ_Uniforme 1 0.062
+        Cp Champ_Uniforme 1 4.1802
+        beta_th Champ_Uniforme 1 0.002902383982248589
+    }
+
+    Navier_Stokes_turbulent
+    {
+        solveur_pression AMG GCP { rtol 1.e-6 impr }
+        convection { muscl }
+        diffusion { }
+        conditions_initiales { vitesse Champ_uniforme 3 0. 0. 0. }
+        sources { boussinesq_temperature { T0 30. } }
+        conditions_limites
+        {
+            inlet frontiere_ouverte_vitesse_imposee Champ_front_uniforme 3 0. 0. 0.31
+            walls	 paroi_fixe
+            outlet frontiere_ouverte_pression_imposee Champ_front_uniforme 1 0
+        }
+
+        Modele_turbulence Sous_maille_wale
+        {
+            cw 0.5
+            turbulence_paroi negligeable
+        }
+    }
+    Convection_Diffusion_Temperature_Turbulent
+    {
+        diffusion { }
+        convection { muscl }
+        conditions_initiales { temperature champ_fonc_xyz dom 1 ((x*x+y*y)[(10e-3*10e-3)*(z[0.06))*20+20 }
+        boundary_conditions
+        {
+            outlet  frontiere_ouverte T_ext champ_front_uniforme 1 20
+            inlet  frontiere_ouverte_temperature_imposee  champ_front_uniforme 1 40
+            walls  Paroi_adiabatique
+        }
+        Modele_turbulence Prandtl
+        {
+            turbulence_paroi negligeable_scalaire
+        }
+    }
+
+}
+
+
+Resoudre pb1
+Fin
diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky.data b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky.data
index f3477328aa..d51cb9b4e2 100644
--- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky.data
+++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky.data
@@ -40,7 +40,9 @@ END PARTITION #
 Scatter dom.Zones dom
 END SCATTER #
 
-vef dis
+vef dis 
+Lire dis { reorder { algo Hilbert } }
+
 Runge_Kutta_Rationnel_ordre_2 sch
 Lire sch
 {
diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.dalianvl_cc100 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..063ea275e2
--- /dev/null
+++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the cuDSS_cholesky_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-06-2026 -- 12:53:42
+OS:       dalianvl16__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2332800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                13.2371        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.120239       
+Average number of iteration of the linear solver per call:                 2              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               0.696072       
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.0773413      
+Standard deviation between time steps:                                     0.00315073     
+Time elapsed in the skipped time steps:                                    0.324934       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.000121945     |  0.2        | 2              
+Convection operator                      | 0.008095081     | 10.5        | 4              
+Diffusion operator                       | 0.02787197      | 36.0        | 32             
+Gradient operator                        | 0.004787126     |  6.2        | 4              
+Divergence operator                      | 0.001358279     |  1.8        | 3              
+Source terms                             | 0.003056822     |  4.0        | 2              
+Update ::mettre_a_jour                   | 0.001711101     |  2.2        | 1              
+Solver for implicit diffusion            | 0.01594707      | 20.6        | 4              
+Computation of the time step dt          | 0.002858162     |  3.7        | 8              
+Turbulence model::update                 | 0.0005988777    |  0.8        | 1              
+Post-treatment operations                | 0.00524563      |  6.8        | 1              
+Other operations                         | 0.005689237     |  7.4        | 
+
+Average number of iteration of the linear solver per call:                 2              
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 9.37107e-05     |  0.1        | 2               | 
+Kernels:                                 | 0.0693074       | 89.6        | 1063            | 
+Copy host to device:                     | 0.000347382     |  0.4        | 18              | 10.3 GB/s
+Copy device to host:                     | 0.000251933     |  0.3        | 7               | 41.9 GB/s
+Alloc/Free on device:                    | 0.000161753     |  0.2        | 0               | 
+GPU: 90% Copy H<->D: 0.77% Alloc/free: 0.21% Comm: 0% CPU & I/O: 9.3%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.157463       
+
+Total time for the whole computation                                       14.4155        
+
diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.eureka_cc89 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..126062ed7c
--- /dev/null
+++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.eureka_cc89
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the cuDSS_cholesky_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:38:14
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2332800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                58.8161        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             12.2049        
+Average number of iteration of the linear solver per call:                 2              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.3888         
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.265422       
+Standard deviation between time steps:                                     0.00700682     
+Time elapsed in the skipped time steps:                                    1.30429        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.10549         | 39.7        | 2              
+Convection operator                      | 0.0174541       |  6.6        | 4              
+Diffusion operator                       | 0.04629194      | 17.4        | 26             
+Gradient operator                        | 0.01120828      |  4.2        | 4              
+Divergence operator                      | 0.003497302     |  1.3        | 3              
+Source terms                             | 0.01068952      |  4.0        | 2              
+Update ::mettre_a_jour                   | 0.004092763     |  1.5        | 1              
+Solver for implicit diffusion            | 0.03844643      | 14.5        | 4              
+Computation of the time step dt          | 0.007731785     |  2.9        | 8              
+Turbulence model::update                 | 0.001743632     |  0.7        | 1              
+Post-treatment operations                | 0.006007037     |  2.3        | 1              
+Other operations                         | 0.01276892      |  4.8        | 
+
+Average number of iteration of the linear solver per call:                 2              
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.105468        | 39.7        | 2               | 
+Kernels:                                 | 0.152195        | 57.3        | 906             | 
+Copy host to device:                     | 0.000530083     |  0.2        | 18              | 6.7 GB/s
+Copy device to host:                     | 0.00182647      |  0.7        | 7               | 5.8 GB/s
+Alloc/Free on device:                    | 0.00010853      |  0.0        | 0               | 
+GPU: 97% Copy H<->D: 0.89% Alloc/free: 0.041% Comm: 0% CPU & I/O: 2%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0813076      
+
+Total time for the whole computation                                       62.5905        
+
diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.irene-amd-ccrt_cc70
index 89723c8543..f0274bdf4c 100644
--- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.irene-amd-ccrt_cc70
+++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.irene-amd-ccrt_cc70
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 15:20:40
-OS:       irene7067__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     23-04-2026 -- 15:15:24
+OS:       irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
 Total number of threads:80
 GPU model: Tesla V100-SXM2-16GB
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                72.5068        
+Total time of the start-up:                                                75.6536        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             13.6316        
+Average time of the resolution of the linear problem per call:             14.301         
 Average number of iteration of the linear solver per call:                 2              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               4.69841        
+Total time of the time loop:                                               3.67179        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.522046       
-Standard deviation between time steps:                                     0.016306       
-Time elapsed in the skipped time steps:                                    1.94551        
+Average time per time step:                                                0.407977       
+Standard deviation between time steps:                                     0.0117863      
+Time elapsed in the skipped time steps:                                    1.8132         
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.101177        | 19.4        | 2              
-Convection operator                      | 0.0427997       |  8.2        | 4              
-Diffusion operator                       | 0.1635086       | 31.3        | 26             
-Gradient operator                        | 0.03588981      |  6.9        | 4              
-Divergence operator                      | 0.02340071      |  4.5        | 3              
-Source terms                             | 0.03244575      |  6.2        | 2              
-Update ::mettre_a_jour                   | 0.01491838      |  2.9        | 1              
-Solver for implicit diffusion            | 0.04903136      |  9.4        | 4              
-Computation of the time step dt          | 0.03761318      |  7.2        | 8              
-Turbulence model::update                 | 0.004650237     |  0.9        | 1              
-Post-treatment operations                | 0.00782911      |  1.5        | 1              
-Other operations                         | 0.008781795     |  1.7        | 
+Linear solver resolutions Ax=B           | 0.12405         | 30.4        | 2              
+Convection operator                      | 0.02512303      |  6.2        | 4              
+Diffusion operator                       | 0.09736093      | 23.9        | 26             
+Gradient operator                        | 0.01811066      |  4.4        | 4              
+Divergence operator                      | 0.01293818      |  3.2        | 3              
+Source terms                             | 0.02176257      |  5.3        | 2              
+Update ::mettre_a_jour                   | 0.01019637      |  2.5        | 1              
+Solver for implicit diffusion            | 0.04906131      | 12.0        | 4              
+Computation of the time step dt          | 0.02536967      |  6.2        | 8              
+Turbulence model::update                 | 0.003636164     |  0.9        | 1              
+Post-treatment operations                | 0.008549231     |  2.1        | 1              
+Other operations                         | 0.01181943      |  2.9        | 
 
 Average number of iteration of the linear solver per call:                 2              
 
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 2
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.10115         | 19.4        | 2               | 
-Kernels:                                 | 0.410303        | 78.6        | 908             | 
-Copy host to device:                     | 0.00108591      |  0.2        | 18              | 3.3 GB/s
-Copy device to host:                     | 0.00235037      |  0.5        | 7               | 4.5 GB/s
-Alloc/Free on device:                    | 8.16001e-05     |  0.0        | 0               | 
-GPU: 98% Copy H<->D: 0.66% Alloc/free: 0.016% Comm: 0% CPU & I/O: 1.4%
+Libraries:                               | 0.124019        | 30.4        | 2               | 
+Kernels:                                 | 0.272361        | 66.8        | 906             | 
+Copy host to device:                     | 0.00113228      |  0.3        | 18              | 3.2 GB/s
+Copy device to host:                     | 0.00262643      |  0.6        | 7               | 4.0 GB/s
+Alloc/Free on device:                    | 9.4755e-05      |  0.0        | 0               | 
+GPU: 97% Copy H<->D: 0.92% Alloc/free: 0.023% Comm: 0% CPU & I/O: 1.9%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.163355       
+Time of the post-resolution:                                               0.174586       
 
-Total time for the whole computation                                       79.3141        
+Total time for the whole computation                                       81.3132        
 
-[Slurm] Power consumption (104 s):  0.176 kW  0.005 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (99 s):  0.201 kW  0.006 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is157091_cc86 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is157091_cc86
index f65252b26c..e21c107726 100644
--- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is157091_cc86
+++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is157091_cc86
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     10-03-2026 -- 08:45:30
+Date:     31-05-2026 -- 09:28:51
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
 GPU model: NVIDIA RTX A6000
 CUDA runtime version: 12.90
-CUDA drivers version: 12.70
+CUDA drivers version: 13.20
 Nb procs used for the computation: 1
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 2332800
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                52.0033        
+Total time of the start-up:                                                37.9452        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             10.7848        
+Average time of the resolution of the linear problem per call:             10.8771        
 Average number of iteration of the linear solver per call:                 2              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               4.45148        
+Total time of the time loop:                                               3.85018        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.494609       
-Standard deviation between time steps:                                     0.0137016      
-Time elapsed in the skipped time steps:                                    1.21885        
+Average time per time step:                                                0.427798       
+Standard deviation between time steps:                                     0.0119921      
+Time elapsed in the skipped time steps:                                    1.148          
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.117044        | 23.7        | 2              
-Convection operator                      | 0.04088689      |  8.3        | 4              
-Diffusion operator                       | 0.1189428       | 24.0        | 26             
-Gradient operator                        | 0.03040632      |  6.1        | 4              
-Divergence operator                      | 0.01833053      |  3.7        | 3              
-Source terms                             | 0.03895697      |  7.9        | 2              
-Update ::mettre_a_jour                   | 0.01322621      |  2.7        | 1              
-Solver for implicit diffusion            | 0.05433776      | 11.0        | 4              
-Computation of the time step dt          | 0.04061406      |  8.2        | 8              
-Turbulence model::update                 | 0.005045901     |  1.0        | 1              
-Post-treatment operations                | 0.005178893     |  1.0        | 1              
-Other operations                         | 0.01163865      |  2.4        | 
+Linear solver resolutions Ax=B           | 0.117641        | 27.5        | 2              
+Convection operator                      | 0.03657583      |  8.5        | 4              
+Diffusion operator                       | 0.09317057      | 21.8        | 26             
+Gradient operator                        | 0.01422524      |  3.3        | 4              
+Divergence operator                      | 0.01207189      |  2.8        | 3              
+Source terms                             | 0.03228191      |  7.5        | 2              
+Update ::mettre_a_jour                   | 0.01044167      |  2.4        | 1              
+Solver for implicit diffusion            | 0.05658411      | 13.2        | 4              
+Computation of the time step dt          | 0.03139697      |  7.3        | 8              
+Turbulence model::update                 | 0.004705404     |  1.1        | 1              
+Post-treatment operations                | 0.005553535     |  1.3        | 1              
+Other operations                         | 0.01315024      |  3.1        | 
 
 Average number of iteration of the linear solver per call:                 2              
 
@@ -63,16 +63,16 @@ Average number of iteration of the linear solver per call:                 2
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.117025        | 23.7        | 2               | 
-Kernels:                                 | 0.371188        | 75.0        | 908             | 
-Copy host to device:                     | 0.000477292     |  0.1        | 18              | 7.5 GB/s
-Copy device to host:                     | 0.00108064      |  0.2        | 7               | 9.8 GB/s
-Alloc/Free on device:                    | 0.000145335     |  0.0        | 0               | 
-GPU: 99% Copy H<->D: 0.31% Alloc/free: 0.029% Comm: 0% CPU & I/O: 0.95%
+Libraries:                               | 0.117615        | 27.5        | 2               | 
+Kernels:                                 | 0.303267        | 70.9        | 906             | 
+Copy host to device:                     | 0.000490062     |  0.1        | 18              | 7.3 GB/s
+Copy device to host:                     | 0.00116166      |  0.3        | 7               | 9.1 GB/s
+Alloc/Free on device:                    | 0.000149718     |  0.0        | 0               | 
+GPU: 98% Copy H<->D: 0.39% Alloc/free: 0.035% Comm: 0% CPU & I/O: 1.2%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.0710111      
+Time of the post-resolution:                                               0.0719171      
 
-Total time for the whole computation                                       57.7447        
+Total time for the whole computation                                       43.0153        
 
diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is159479_cc120 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..c73f9b79df
--- /dev/null
+++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.is159479_cc120
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the cuDSS_cholesky_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:38:43
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 2332800
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                39.9352        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             8.43065        
+Average number of iteration of the linear solver per call:                 2              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.75322        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.194802       
+Standard deviation between time steps:                                     0.00396668     
+Time elapsed in the skipped time steps:                                    0.821774       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.101463        | 52.1        | 2              
+Convection operator                      | 0.01183454      |  6.1        | 4              
+Diffusion operator                       | 0.0286888       | 14.7        | 26             
+Gradient operator                        | 0.006335487     |  3.3        | 4              
+Divergence operator                      | 0.001810582     |  0.9        | 3              
+Source terms                             | 0.006615974     |  3.4        | 2              
+Update ::mettre_a_jour                   | 0.002287811     |  1.2        | 1              
+Solver for implicit diffusion            | 0.01991212      | 10.2        | 4              
+Computation of the time step dt          | 0.004273607     |  2.2        | 8              
+Turbulence model::update                 | 0.001026106     |  0.5        | 1              
+Post-treatment operations                | 0.003897243     |  2.0        | 1              
+Other operations                         | 0.006656335     |  3.4        | 
+
+Average number of iteration of the linear solver per call:                 2              
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.101454        | 52.1        | 2               | 
+Kernels:                                 | 0.0885208       | 45.4        | 906             | 
+Copy host to device:                     | 0.000343567     |  0.2        | 18              | 10.4 GB/s
+Copy device to host:                     | 0.00150662      |  0.8        | 7               | 7.0 GB/s
+Alloc/Free on device:                    | 4.49228e-05     |  0.0        | 0               | 
+GPU: 98% Copy H<->D: 0.95% Alloc/free: 0.023% Comm: 0% CPU & I/O: 1.5%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0577461      
+
+Total time for the whole computation                                       42.5679        
+
diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.jean-zay_cc90 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.jean-zay_cc90
index 2356b4437a..5aaf2bc0e6 100644
--- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.jean-zay_cc90
+++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.jean-zay_cc90
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 14:10:34
-OS:       jzxh080__Linux__x86_64__5.14.0-570.58.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Oct 21 04:15:07 EDT 2025
+Date:     10-06-2026 -- 14:11:55
+OS:       jzxh126__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026
 CPU model : Intel(R) Xeon(R) Platinum 8468
 Total number of threads:192
 GPU model: NVIDIA H100 80GB HBM3
 CUDA runtime version: 12.60
-CUDA drivers version: 13.0
+CUDA drivers version: 13.20
 Nb procs used for the computation: 1
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 2332800
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                51.2627        
+Total time of the start-up:                                                42.7384        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             11.2967        
+Average time of the resolution of the linear problem per call:             11.1825        
 Average number of iteration of the linear solver per call:                 2              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               1.56552        
+Total time of the time loop:                                               1.39083        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.173947       
-Standard deviation between time steps:                                     0.00701508     
-Time elapsed in the skipped time steps:                                    1.0813         
+Average time per time step:                                                0.154536       
+Standard deviation between time steps:                                     0.00788354     
+Time elapsed in the skipped time steps:                                    0.991765       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0661068       | 38.0        | 2              
-Convection operator                      | 0.01267389      |  7.3        | 4              
-Diffusion operator                       | 0.03157946      | 18.2        | 26             
-Gradient operator                        | 0.01002847      |  5.8        | 4              
-Divergence operator                      | 0.004405446     |  2.5        | 3              
-Source terms                             | 0.006924221     |  4.0        | 2              
-Update ::mettre_a_jour                   | 0.00359039      |  2.1        | 1              
-Solver for implicit diffusion            | 0.01661915      |  9.6        | 4              
-Computation of the time step dt          | 0.005447637     |  3.1        | 8              
-Turbulence model::update                 | 0.001024498     |  0.6        | 1              
-Post-treatment operations                | 0.009382817     |  5.4        | 1              
-Other operations                         | 0.006164344     |  3.5        | 
+Linear solver resolutions Ax=B           | 0.0572004       | 37.0        | 2              
+Convection operator                      | 0.009876482     |  6.4        | 4              
+Diffusion operator                       | 0.02682544      | 17.4        | 26             
+Gradient operator                        | 0.008371504     |  5.4        | 4              
+Divergence operator                      | 0.001878353     |  1.2        | 3              
+Source terms                             | 0.006324317     |  4.1        | 2              
+Update ::mettre_a_jour                   | 0.002631955     |  1.7        | 1              
+Solver for implicit diffusion            | 0.01911372      | 12.4        | 4              
+Computation of the time step dt          | 0.004478554     |  2.9        | 8              
+Turbulence model::update                 | 0.000924802     |  0.6        | 1              
+Post-treatment operations                | 0.009516202     |  6.2        | 1              
+Other operations                         | 0.007394542     |  4.8        | 
 
 Average number of iteration of the linear solver per call:                 2              
 
@@ -63,17 +63,17 @@ Average number of iteration of the linear solver per call:                 2
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0660849       | 38.0        | 2               | 
-Kernels:                                 | 0.0954356       | 54.9        | 908             | 
-Copy host to device:                     | 0.000601093     |  0.3        | 18              | 5.9 GB/s
-Copy device to host:                     | 0.00153109      |  0.9        | 7               | 6.9 GB/s
-Alloc/Free on device:                    | 7.77541e-05     |  0.0        | 0               | 
-GPU: 93% Copy H<->D: 1.2% Alloc/free: 0.045% Comm: 0% CPU & I/O: 5.9%
+Libraries:                               | 0.0571762       | 37.0        | 2               | 
+Kernels:                                 | 0.0847694       | 54.9        | 906             | 
+Copy host to device:                     | 0.000616722     |  0.4        | 18              | 5.8 GB/s
+Copy device to host:                     | 0.00155482      |  1.0        | 7               | 6.8 GB/s
+Alloc/Free on device:                    | 7.43548e-05     |  0.0        | 0               | 
+GPU: 92% Copy H<->D: 1.4% Alloc/free: 0.048% Comm: 0% CPU & I/O: 6.7%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.111628       
+Time of the post-resolution:                                               0.125482       
 
-Total time for the whole computation                                       54.0211        
+Total time for the whole computation                                       45.2465        
 
-[Slurm] Power consumption (63 s):  0.452 kW  0.008 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (59 s):  0.438 kW  0.007 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.topaze_cc80 b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.topaze_cc80
index 6cb49dcfcb..181c563808 100644
--- a/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.topaze_cc80
+++ b/tests/GPU/cuDSS_cholesky/cuDSS_cholesky_BENCH.TU.topaze_cc80
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 17:27:08
-OS:       topaze7005__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     13-06-2026 -- 17:17:22
+OS:       topaze7048__Linux__x86_64__4.18.0-553.123.1.el8_10.x86_64__#1 SMP Mon May 4 13:45:48 EDT 2026
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,38 +22,38 @@ Total number of elements used for the calculation: 2332800
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                62.3042        
+Total time of the start-up:                                                46.3126        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             12.4547        
+Average time of the resolution of the linear problem per call:             12.2661        
 Average number of iteration of the linear solver per call:                 2              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               2.45396        
+Total time of the time loop:                                               1.98361        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.272662       
-Standard deviation between time steps:                                     0.00993732     
-Time elapsed in the skipped time steps:                                    1.29729        
+Average time per time step:                                                0.220401       
+Standard deviation between time steps:                                     0.00668609     
+Time elapsed in the skipped time steps:                                    1.13073        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0858845       | 20.6        | 2              
-Convection operator                      | 0.02602534      |  6.2        | 4              
-Diffusion operator                       | 0.06160948      | 14.8        | 26             
-Gradient operator                        | 0.01633848      |  3.9        | 4              
-Divergence operator                      | 0.005998426     |  1.4        | 3              
-Source terms                             | 0.01078654      |  2.6        | 2              
-Update ::mettre_a_jour                   | 0.004897438     |  1.2        | 1              
-Solver for implicit diffusion            | 0.029447        |  7.1        | 4              
-Computation of the time step dt          | 0.01235725      |  3.0        | 8              
-Turbulence model::update                 | 0.001674552     |  0.4        | 1              
-Post-treatment operations                | 0.008454185     |  2.0        | 1              
-Other operations                         | 0.009188697     |  2.2        | 
+Linear solver resolutions Ax=B           | 0.0839011       | 38.1        | 2              
+Convection operator                      | 0.01539531      |  7.0        | 4              
+Diffusion operator                       | 0.04144135      | 18.8        | 26             
+Gradient operator                        | 0.01224873      |  5.6        | 4              
+Divergence operator                      | 0.002501917     |  1.1        | 3              
+Source terms                             | 0.00925932      |  4.2        | 2              
+Update ::mettre_a_jour                   | 0.003451533     |  1.6        | 1              
+Solver for implicit diffusion            | 0.02763385      | 12.5        | 4              
+Computation of the time step dt          | 0.007391328     |  3.4        | 8              
+Turbulence model::update                 | 0.001449264     |  0.7        | 1              
+Post-treatment operations                | 0.006868499     |  3.1        | 1              
+Other operations                         | 0.008858503     |  4.0        | 
 
 Average number of iteration of the linear solver per call:                 2              
 
@@ -63,16 +63,17 @@ Average number of iteration of the linear solver per call:                 2
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0858593       | 31.5        | 2               | 
-Kernels:                                 | 0.176086        | 64.6        | 908             | 
-Copy host to device:                     | 0.000506159     |  0.2        | 18              | 7.1 GB/s
-Copy device to host:                     | 0.000966553     |  0.4        | 7               | 10.9 GB/s
-Alloc/Free on device:                    | 9.52723e-05     |  0.0        | 0               | 
-GPU: 96% Copy H<->D: 0.54% Alloc/free: 0.035% Comm: 0% CPU & I/O: 3.4%
+Libraries:                               | 0.0838766       | 38.1        | 2               | 
+Kernels:                                 | 0.127721        | 57.9        | 906             | 
+Copy host to device:                     | 0.000500837     |  0.2        | 18              | 7.1 GB/s
+Copy device to host:                     | 0.000939364     |  0.4        | 7               | 11.2 GB/s
+Alloc/Free on device:                    | 9.52792e-05     |  0.0        | 0               | 
+GPU: 96% Copy H<->D: 0.65% Alloc/free: 0.043% Comm: 0% CPU & I/O: 3.3%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.203789       
+Time of the post-resolution:                                               0.188939       
 
-Total time for the whole computation                                       66.2593        
+Total time for the whole computation                                       49.6159        
 
+[Slurm] Power consumption (79 s):  0.765 kW  0.017 kWh  0.002 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/check_perf.sh b/tests/GPU/thermohydraulique_VDF_DNS/check_perf.sh
new file mode 120000
index 0000000000..6d20411c12
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/check_perf.sh
@@ -0,0 +1 @@
+../DomainFlowLES/check_perf.sh
\ No newline at end of file
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS.data b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS.data
new file mode 100644
index 0000000000..c0d2a26856
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS.data
@@ -0,0 +1,584 @@
+# ThermoHydraulique 3D : bali metal  VDF - Boussinesq #
+# PARALLEL OK #
+dimension 3
+Pb_Thermohydraulique pb
+
+Domaine dom
+
+# BEGIN MESH #
+Mailler dom
+{
+    Pave Entree
+    {
+        /* warning dumb geometry */
+        Origine 0. 0. 0.
+        Nombre_de_Noeuds 11 11 11
+        /* Nombre_de_Noeuds 101 101 101 */
+        Longueurs 1 1 1
+    }
+    {
+        Bord gauche X = 0.  0. <= Y <= 1. 0. <= Z <= 1.
+        Bord droite X = 1.  0. <= Y <= 1. 0. <= Z <= 1.
+        Bord haut   Y = 0.  0. <= X <= 1. 0. <= Z <= 1.
+        Bord bas    Y = 1.  0. <= X <= 1. 0. <= Z <= 1.
+        Bord devant   Z = 0.  0. <= X <= 1. 0. <= Y <= 1.
+        Bord derriere   Z = 1.  0. <= X <= 1. 0. <= Y <= 1.
+    }
+}
+# END MESH #
+
+# BEGIN PARTITION
+Partition dom
+{
+    Partition_tool Metis { Nb_parts 2 }
+    Larg_joint 2
+    zones_name DOM
+}
+End
+END PARTITION #
+
+# BEGIN SCATTER
+Scatter DOM.Zones dom
+END SCATTER #
+
+VDF dis
+Lire dis { reorder { algo hilbert } }
+
+Scheme_euler_explicit sch_ex 
+Read sch_ex
+{
+nb_pas_dt_max 10
+        tinit 0.
+        tmax 3000.
+        dt_min 1.e-11
+        dt_max 0.1
+        dt_impr 5.
+        dt_sauv 5.
+        dt_start dt_calc
+        seuil_statio 1.e-8
+        facsec 1
+        diffusion_implicite 1
+        tcpumax 23.5 		/* Le calcul s'arretera proprement apres 23h30 */
+}
+
+
+
+Associate pb dom
+Associate pb sch_ex
+
+Discretize pb dis
+
+# extraction des surfaces, conditions limites pour le post-traitement # 
+Domaine haut
+Extraire_surface { domaine haut probleme pb avec_certains_bords 1 haut } 		
+Domaine bas
+Extraire_surface { domaine bas probleme pb avec_certains_bords 1 bas } 
+Domaine gauche				
+Extraire_surface { domaine gauche probleme pb avec_certains_bords 1 gauche }
+Domaine droite
+Extraire_surface { domaine droite probleme pb avec_certains_bords 1 droite }
+Domaine milieu
+Extraire_surface { domaine milieu probleme pb condition_elements 0.09<z<0.11 } # to check #
+
+Lire pb
+{
+       # PARAMETRES PHYSIQUES DE L'EAU  a 20 C #
+       Fluide_Incompressible
+       {
+        gravite champ_uniforme 3 0. -9.81 0.
+	mu	Champ_Uniforme 1 4.5696e-3
+	rho	Champ_Uniforme 1 6720.
+	lambda	Champ_Uniforme 1 20.
+	Cp	Champ_Uniforme 1 674.
+	beta_th	Champ_Uniforme 1 3.0e-5
+        }
+	
+
+	Navier_Stokes_Standard
+	{
+		# solveur_pression petsc Cholesky { } #
+	# 	solveur_pression GCP {
+                      seuil 1.e-12 impr
+                      precond ssor {
+                          omega 1.55 
+                      }
+                } #
+                solveur_pression AMG GCP { atol 1.e-12 impr }
+		convection { amont }
+		diffusion { }
+		sources { boussinesq_temperature { T0 2141. }  }
+		
+		conditions_initiales
+		{
+			vitesse Champ_Uniforme 3 0. 0. 0.
+		}
+		
+		conditions_limites
+		{
+			haut 	 paroi_fixe
+			bas 	 paroi_fixe
+			droite	 paroi_fixe
+			gauche	 paroi_fixe
+			devant   paroi_fixe
+			derriere paroi_fixe
+		}
+	}
+	
+	
+	
+	Convection_Diffusion_Temperature
+	{
+		convection { Quick }
+		diffusion { }
+		sources { }
+		
+		conditions_initiales
+		{
+			Temperature Champ_Uniforme 1 2141.
+		}
+		
+		conditions_limites
+		{
+			haut 	  paroi_flux_impose  Ch_front_input  { nom flux_ex probleme pb nb_comp 1  initial_value 1 -0.e6 }
+			bas 	  paroi_flux_impose Champ_front_Uniforme 1 1.E6 
+			droite 	  paroi_adiabatique
+			gauche    paroi_temperature_imposee Champ_front_Uniforme 1 1658.
+			devant    paroi_adiabatique
+			derriere  paroi_adiabatique
+		}
+	}
+	
+	
+	Postraitements
+	{
+		# post-traitement du domaine entier #		
+		post_dom {		
+			Format lml
+			fichier thermohydraulique_VDF_DNS 
+			Definition_champs 
+			{
+				# moyenne global de la temperature sur tout le domaine # 				
+				Tmoy Reduction_0D { methode moyenne source Refchamp { Pb_champ pb temperature } } 
+
+				# moyenne selon X de la temperature sur le domaine milieu # 
+				Tmoy_X_milieu Reduction_0D { 
+					methode moyenne source Interpolation { 
+						domaine milieu localisation elem source Refchamp { Pb_champ pb temperature } } }
+
+				# moyenne de la vitesse selon X sur la surface haut # 
+				VmoyX_haut Reduction_0D { 
+					methode moyenne source Extraction { 
+						domaine haut nom_frontiere haut source Transformation { 
+							methode composante numero 0 localisation elem source Refchamp { Pb_champ pb vitesse } } } } 
+
+				# moyenne de la vitesse selon Z sur la surface gauche # 
+				VmoyZ_gauche Reduction_0D { 
+					methode moyenne source Extraction { 
+						domaine gauche nom_frontiere gauche source Transformation { 
+							methode composante numero 2 localisation elem source Refchamp { Pb_champ pb vitesse } } } } 
+
+				# gradtT #
+				gradT Gradient {
+				source refchamp { Pb_champ pb temperature }
+				}
+
+				# gradx temperature #
+				gradTx Transformation { 
+				       localisation elem
+				       methode composante numero 0
+				       sources_reference { gradT }
+				       }
+
+				# grady temperature #
+				gradTy Transformation { 
+				       localisation elem
+				       methode composante numero 1
+				       sources_reference { gradT }
+				       }
+
+				# gradz temperature #
+				gradTz Transformation { 
+				       localisation elem
+				       methode composante numero 2
+				       sources_reference { gradT }
+				       }
+			} 
+
+			Probes
+			{ 
+                                                                                                # xdeb ydeb zdeb xfin yfin zfin #
+        		       sonde_temp_back_wall      temperature periode 10 segment 2000   0.0  0.13   0.10  2.0  0.13   0.10
+        		       sonde_temp_ymiddle_zmiddle_x      temperature periode 10 segment 2000   0.0  0.065   0.10  2.0  0.065   0.10
+        		       sonde_temp_ymiddle_zepoxy_x      temperature periode 10 segment 2000   0.0  0.065   0.19  2.0  0.065   0.19
+        		       sonde_temp_ymiddle_zheating_x      temperature periode 10 segment 2000   0.0  0.065   0.001  2.0  0.065   0.001
+        		       sonde_temp_ymiddle_z200mm_x      temperature periode 10 segment 2000   0.0  0.065   0.05  2.0  0.065   0.05
+        		       sonde_temp_ymiddle_z1200mm_x      temperature periode 10 segment 2000   0.0  0.065   0.15  2.0  0.065   0.15
+			       sonde_temp_heating_plate  temperature periode 10 segment 2000   0.0   0.065  0.0    2.0   0.065  0.0
+			       sonde_temp_epoxy_plate    temperature periode 10 segment 2000    0.0   0.065  0.2   2.0   0.065  0.2
+
+			       sonde_temp_ymiddle_x1mm_z    temperature periode 10 segment 200    0.001   0.065   0.0    0.001   0.065   0.20
+			       sonde_temp_ymiddle_x1cm_z    temperature periode 10 segment 200    0.01   0.065   0.0    0.01   0.065   0.20
+			       sonde_temp_ymiddle_x2cm_z    temperature periode 10 segment 200    0.02   0.065   0.0    0.02   0.065   0.20
+			       sonde_temp_ymiddle_x4cm_z    temperature periode 10 segment 200    0.04   0.065   0.0    0.04   0.065   0.20
+			       sonde_temp_ymiddle_x8cm_z    temperature periode 10 segment 200    0.08   0.065   0.0    0.08   0.065   0.20
+			       sonde_temp_ymiddle_x15cm_z    temperature periode 10 segment 200   0.15   0.065   0.0    0.15   0.065   0.20
+			       sonde_temp_ymiddle_x25cm_z    temperature periode 10 segment 200   0.25   0.065   0.0    0.25   0.065   0.20
+			       sonde_temp_ymiddle_x200cm_z    temperature periode 10 segment 200  0.200   0.065   0.0    0.200   0.065   0.20
+			       sonde_temp_ymiddle_x100cm_z    temperature periode 10 segment 200     1.   0.065   0.0    1.   0.065   0.20
+			       sonde_temp_ymiddle_x150cm_z    temperature periode 10 segment 200   1.5   0.065   0.0    1.5   0.065   0.20
+			       sonde_temp_ymiddle_x170cm_z    temperature periode 10 segment 200  1.70   0.065   0.0    1.70   0.065   0.20
+			       sonde_temp_ymiddle_x180cm_z    temperature periode 10 segment 200   1.80   0.065   0.0    1.80   0.065   0.20
+			       sonde_temp_ymiddle_x190cm_z    temperature periode 10 segment 200  1.90   0.065   0.0    1.90   0.065   0.20
+			       sonde_temp_ymiddle_x195cm_z    temperature periode 10 segment 200  1.95   0.065   0.0    1.95   0.065   0.20
+			       sonde_temp_ymiddle_x198cm_z    temperature periode 10 segment 200  1.98   0.065   0.0    1.98   0.065   0.20
+			       sonde_temp_ymiddle_x199cm_z    temperature periode 10 segment 200  1.99   0.065   0.0    1.99   0.065   0.20
+
+    			       sonde_gradTx_back_wall      gradTx periode 10 segment 2000   0.0  0.13   0.10  2.0  0.13   0.10
+        		       sonde_gradTx_ymiddle_zmiddle_x      gradTx periode 10 segment 2000   0.0  0.065   0.10  2.0  0.065   0.10
+        		       sonde_gradTx_ymiddle_zepoxy_x      gradTx periode 10 segment 2000   0.0  0.065   0.19  2.0  0.065   0.19
+        		       sonde_gradTx_ymiddle_zheating_x      gradTx periode 10 segment 2000   0.0  0.065   0.001  2.0  0.065   0.001
+        		       sonde_gradTx_ymiddle_z200mm_x      gradTx periode 10 segment 2000   0.0  0.065   0.05  2.0  0.065   0.05
+        		       sonde_gradTx_ymiddle_z1200mm_x      gradTx periode 10 segment 2000   0.0  0.065   0.15  2.0  0.065   0.15
+			       sonde_gradTx_heating_plate  gradTx periode 10 segment 2000   0.0   0.065  0.0    2.0   0.065  0.0
+			       sonde_gradTx_epoxy_plate    gradTx periode 10 segment 2000    0.0   0.065  0.2   2.0   0.065  0.2
+
+			       sonde_gradTx_ymiddle_x1mm_z    gradTx periode 10 segment 200    0.001   0.065   0.0    0.001   0.065   0.20
+			       sonde_gradTx_ymiddle_x1cm_z    gradTx periode 10 segment 200    0.01   0.065   0.0    0.01   0.065   0.20
+			       sonde_gradTx_ymiddle_x2cm_z    gradTx periode 10 segment 200    0.02   0.065   0.0    0.02   0.065   0.20
+			       sonde_gradTx_ymiddle_x4cm_z    gradTx periode 10 segment 200    0.04   0.065   0.0    0.04   0.065   0.20
+			       sonde_gradTx_ymiddle_x8cm_z    gradTx periode 10 segment 200    0.08   0.065   0.0    0.08   0.065   0.20
+			       sonde_gradTx_ymiddle_x15cm_z    gradTx periode 10 segment 200   0.15   0.065   0.0    0.15   0.065   0.20
+			       sonde_gradTx_ymiddle_x25cm_z    gradTx periode 10 segment 200   0.25   0.065   0.0    0.25   0.065   0.20
+			       sonde_gradTx_ymiddle_x200cm_z    gradTx periode 10 segment 200  0.200   0.065   0.0    0.200   0.065   0.20
+			       sonde_gradTx_ymiddle_x100cm_z    gradTx periode 10 segment 200     1.   0.065   0.0    1.   0.065   0.20
+			       sonde_gradTx_ymiddle_x150cm_z    gradTx periode 10 segment 200   1.5   0.065   0.0    1.5   0.065   0.20
+			       sonde_gradTx_ymiddle_x170cm_z    gradTx periode 10 segment 200  1.70   0.065   0.0    1.70   0.065   0.20
+			       sonde_gradTx_ymiddle_x180cm_z    gradTx periode 10 segment 200   1.80   0.065   0.0    1.80   0.065   0.20
+			       sonde_gradTx_ymiddle_x190cm_z    gradTx periode 10 segment 200  1.90   0.065   0.0    1.90   0.065   0.20
+			       sonde_gradTx_ymiddle_x195cm_z    gradTx periode 10 segment 200  1.95   0.065   0.0    1.95   0.065   0.20
+			       sonde_gradTx_ymiddle_x198cm_z    gradTx periode 10 segment 200  1.98   0.065   0.0    1.98   0.065   0.20
+			       sonde_gradTx_ymiddle_x199cm_z    gradTx periode 10 segment 200  1.99   0.065   0.0    1.99   0.065   0.20
+
+                          
+        	               sonde_gradTy_back_wall      gradTy periode 10 segment 2000   0.0  0.13   0.10  2.0  0.13   0.10
+        		       sonde_gradTy_ymiddle_zmiddle_x      gradTy periode 10 segment 2000   0.0  0.065   0.10  2.0  0.065   0.10
+        		       sonde_gradTy_ymiddle_zepoxy_x      gradTy periode 10 segment 2000   0.0  0.065   0.19  2.0  0.065   0.19
+        		       sonde_gradTy_ymiddle_zheating_x      gradTy periode 10 segment 2000   0.0  0.065   0.001  2.0  0.065   0.001
+        		       sonde_gradTy_ymiddle_z200mm_x      gradTy periode 10 segment 2000   0.0  0.065   0.05  2.0  0.065   0.05
+        		       sonde_gradTy_ymiddle_z1200mm_x      gradTy periode 10 segment 2000   0.0  0.065   0.15  2.0  0.065   0.15
+			       sonde_gradTy_heating_plate  gradTy periode 10 segment 2000   0.0   0.065  0.0    2.0   0.065  0.0
+			       sonde_gradTy_epoxy_plate    gradTy periode 10 segment 2000    0.0   0.065  0.2   2.0   0.065  0.2
+
+			       sonde_gradTy_ymiddle_x1mm_z    gradTy periode 10 segment 200    0.001   0.065   0.0    0.001   0.065   0.20
+			       sonde_gradTy_ymiddle_x1cm_z    gradTy periode 10 segment 200    0.01   0.065   0.0    0.01   0.065   0.20
+			       sonde_gradTy_ymiddle_x2cm_z    gradTy periode 10 segment 200    0.02   0.065   0.0    0.02   0.065   0.20
+			       sonde_gradTy_ymiddle_x4cm_z    gradTy periode 10 segment 200    0.04   0.065   0.0    0.04   0.065   0.20
+			       sonde_gradTy_ymiddle_x8cm_z    gradTy periode 10 segment 200    0.08   0.065   0.0    0.08   0.065   0.20
+			       sonde_gradTy_ymiddle_x15cm_z    gradTy periode 10 segment 200   0.15   0.065   0.0    0.15   0.065   0.20
+			       sonde_gradTy_ymiddle_x25cm_z    gradTy periode 10 segment 200   0.25   0.065   0.0    0.25   0.065   0.20
+			       sonde_gradTy_ymiddle_x200cm_z gradTy periode 10 segment 200  0.200   0.065   0.0    0.200   0.065   0.20
+			       sonde_gradTy_ymiddle_x100cm_z    gradTy periode 10 segment 200     1.   0.065   0.0    1.   0.065   0.20
+			       sonde_gradTy_ymiddle_x150cm_z    gradTy periode 10 segment 200   1.5   0.065   0.0    1.5   0.065   0.20
+			       sonde_gradTy_ymiddle_x170cm_z    gradTy periode 10 segment 200  1.70   0.065   0.0    1.70   0.065   0.20
+			       sonde_gradTy_ymiddle_x180cm_z    gradTy periode 10 segment 200   1.80   0.065   0.0    1.80   0.065   0.20
+			       sonde_gradTy_ymiddle_x190cm_z    gradTy periode 10 segment 200  1.90   0.065   0.0    1.90   0.065   0.20
+			       sonde_gradTy_ymiddle_x195cm_z    gradTy periode 10 segment 200  1.95   0.065   0.0    1.95   0.065   0.20
+			       sonde_gradTy_ymiddle_x198cm_z    gradTy periode 10 segment 200  1.98   0.065   0.0    1.98   0.065   0.20
+			       sonde_gradTy_ymiddle_x199cm_z    gradTy periode 10 segment 200  1.99   0.065   0.0    1.99   0.065   0.20
+			       sonde_gradTy_ymiddle_x2m_z    gradTy periode 10 segment 200     1.999   0.065   0.0    1.999   0.065   0.20
+                               
+			       sonde_gradTz_back_wall      gradTy periode 10 segment 2000   0.0  0.13   0.10  2.0  0.13   0.10
+        		       sonde_gradTz_ymiddle_zmiddle_x      gradTy periode 10 segment 2000   0.0  0.065   0.10  2.0  0.065   0.10
+        		       sonde_gradTz_ymiddle_zepoxy_x      gradTy periode 10 segment 2000   0.0  0.065   0.19  2.0  0.065   0.19
+        		       sonde_gradTz_ymiddle_zheating_x      gradTy periode 10 segment 2000   0.0  0.065   0.001  2.0  0.065   0.001
+        		       sonde_gradTz_ymiddle_z200mm_x      gradTy periode 10 segment 2000   0.0  0.065   0.05  2.0  0.065   0.05
+        		       sonde_gradTz_ymiddle_z1200mm_x      gradTy periode 10 segment 2000   0.0  0.065   0.15  2.0  0.065   0.15
+			       sonde_gradTz_heating_plate  gradTy periode 10 segment 2000   0.0   0.065  0.0    2.0   0.065  0.0
+			       sonde_gradTz_epoxy_plate    gradTy periode 10 segment 2000    0.0   0.065  0.2   2.0   0.065  0.2
+
+			       sonde_gradTz_ymiddle_x1mm_z    gradTy periode 10 segment 200    0.001   0.065   0.0    0.001   0.065   0.20
+			       sonde_gradTz_ymiddle_x1cm_z    gradTy periode 10 segment 200    0.01   0.065   0.0    0.01   0.065   0.20
+			       sonde_gradTz_ymiddle_x2cm_z    gradTy periode 10 segment 200    0.02   0.065   0.0    0.02   0.065   0.20
+			       sonde_gradTz_ymiddle_x4cm_z    gradTy periode 10 segment 200    0.04   0.065   0.0    0.04   0.065   0.20
+			       sonde_gradTz_ymiddle_x8cm_z    gradTy periode 10 segment 200    0.08   0.065   0.0    0.08   0.065   0.20
+			       sonde_gradTz_ymiddle_x15cm_z    gradTy periode 10 segment 200   0.15   0.065   0.0    0.15   0.065   0.20
+			       sonde_gradTz_ymiddle_x25cm_z    gradTy periode 10 segment 200   0.25   0.065   0.0    0.25   0.065   0.20
+			       sonde_gradTz_ymiddle_x200cm_z gradTy periode 10 segment 200  0.200   0.065   0.0    0.200   0.065   0.20
+			       sonde_gradTz_ymiddle_x1m_z    gradTy periode 10 segment 200     1.   0.065   0.0    1.   0.065   0.20
+			       sonde_gradTz_ymiddle_x1_5m_z    gradTy periode 10 segment 200   1.5   0.065   0.0    1.5   0.065   0.20
+			       sonde_gradTz_ymiddle_x170cm_z    gradTy periode 10 segment 200  1.70   0.065   0.0    1.70   0.065   0.20
+			       sonde_gradTz_ymiddle_x180cm_z    gradTy periode 10 segment 200   1.80   0.065   0.0    1.80   0.065   0.20
+			       sonde_gradTz_ymiddle_x190cm_z    gradTy periode 10 segment 200  1.90   0.065   0.0    1.90   0.065   0.20
+			       sonde_gradTz_ymiddle_x195cm_z    gradTy periode 10 segment 200  1.95   0.065   0.0    1.95   0.065   0.20
+			       sonde_gradTz_ymiddle_x198cm_z    gradTy periode 10 segment 200  1.98   0.065   0.0    1.98   0.065   0.20
+			       sonde_gradTz_ymiddle_x199cm_z    gradTy periode 10 segment 200  1.99   0.065   0.0    1.99   0.065   0.20
+			       sonde_gradTz_ymiddle_x2m_z    gradTy periode 10 segment 200     1.999   0.065   0.0    1.999   0.065   0.20
+
+			       sonde_vx_back_wall      vitessex periode 10 segment 2000   0.0  0.13   0.10  2.0  0.13   0.10
+        		       sonde_vx_ymiddle_zmiddle_x      vitessex periode 10 segment 2000   0.0  0.065   0.10  2.0  0.065   0.10
+        		       sonde_vx_ymiddle_zepoxy_x      vitessex periode 10 segment 2000   0.0  0.065   0.19  2.0  0.065   0.19
+        		       sonde_vx_ymiddle_zheating_x      vitessex periode 10 segment 2000   0.0  0.065   0.001  2.0  0.065   0.001
+        		       sonde_vx_ymiddle_z200mm_x      vitessex periode 10 segment 2000   0.0  0.065   0.05  2.0  0.065   0.05
+        		       sonde_vx_ymiddle_z1200mm_x      vitessex periode 10 segment 2000   0.0  0.065   0.15  2.0  0.065   0.15
+			       sonde_vx_heating_plate  vitessex periode 10 segment 2000   0.0   0.065  0.0    2.0   0.065  0.0
+			       sonde_vx_epoxy_plate    vitessex periode 10 segment 2000    0.0   0.065  0.2   2.0   0.065  0.2
+
+			       sonde_vx_ymiddle_x1mm_z    vitessex periode 10 segment 200    0.001   0.065   0.0    0.001   0.065   0.20
+			       sonde_vx_ymiddle_x1cm_z    vitessex periode 10 segment 200    0.01   0.065   0.0    0.01   0.065   0.20
+			       sonde_vx_ymiddle_x2cm_z    vitessex periode 10 segment 200    0.02   0.065   0.0    0.02   0.065   0.20
+			       sonde_vx_ymiddle_x4cm_z    vitessex periode 10 segment 200    0.04   0.065   0.0    0.04   0.065   0.20
+			       sonde_vx_ymiddle_x8cm_z    vitessex periode 10 segment 200    0.08   0.065   0.0    0.08   0.065   0.20
+			       sonde_vx_ymiddle_x15cm_z    vitessex periode 10 segment 200   0.15   0.065   0.0    0.15   0.065   0.20
+			       sonde_vx_ymiddle_x25cm_z    vitessex periode 10 segment 200   0.25   0.065   0.0    0.25   0.065   0.20
+			       sonde_vx_ymiddle_x200cm_z vitessex periode 10 segment 200  0.200   0.065   0.0    0.200   0.065   0.20
+			       sonde_vx_ymiddle_x100cm_z    vitessex periode 10 segment 200     1.   0.065   0.0    1.   0.065   0.20
+			       sonde_vx_ymiddle_x150cm_z    vitessex periode 10 segment 200   1.5   0.065   0.0    1.5   0.065   0.20
+			       sonde_vx_ymiddle_x170cm_z    vitessex periode 10 segment 200  1.70   0.065   0.0    1.70   0.065   0.20
+			       sonde_vx_ymiddle_x180cm_z    vitessex periode 10 segment 200   1.80   0.065   0.0    1.80   0.065   0.20
+			       sonde_vx_ymiddle_x190cm_z    vitessex periode 10 segment 200  1.90   0.065   0.0    1.90   0.065   0.20
+			       sonde_vx_ymiddle_x195cm_z    vitessex periode 10 segment 200  1.95   0.065   0.0    1.95   0.065   0.20
+			       sonde_vx_ymiddle_x198cm_z    vitessex periode 10 segment 200  1.98   0.065   0.0    1.98   0.065   0.20
+			       sonde_vx_ymiddle_x199cm_z    vitessex periode 10 segment 200  1.99   0.065   0.0    1.99   0.065   0.20
+
+
+			       sonde_vy_back_wall      vitessey periode 10 segment 2000   0.0  0.13   0.10  2.0  0.13   0.10
+        		       sonde_vy_ymiddle_zmiddle_x      vitessey periode 10 segment 2000   0.0  0.065   0.10  2.0  0.065   0.10
+        		       sonde_vy_ymiddle_zepoxy_x      vitessey periode 10 segment 2000   0.0  0.065   0.19  2.0  0.065   0.19
+        		       sonde_vy_ymiddle_zheating_x      vitessey periode 10 segment 2000   0.0  0.065   0.001  2.0  0.065   0.001
+        		       sonde_vy_ymiddle_z200mm_x      vitessey periode 10 segment 2000   0.0  0.065   0.05  2.0  0.065   0.05
+        		       sonde_vy_ymiddle_z1200mm_x      vitessey periode 10 segment 2000   0.0  0.065   0.15  2.0  0.065   0.15
+			       sonde_vy_heating_plate  vitessey periode 10 segment 2000   0.0   0.065  0.0    2.0   0.065  0.0
+			       sonde_vy_epoxy_plate    vitessey periode 10 segment 2000    0.0   0.065  0.2   2.0   0.065  0.2
+
+			       sonde_vy_ymiddle_x1mm_z    vitessey periode 10 segment 200    0.001   0.065   0.0    0.001   0.065   0.20
+			       sonde_vy_ymiddle_x1cm_z    vitessey periode 10 segment 200    0.01   0.065   0.0    0.01   0.065   0.20
+			       sonde_vy_ymiddle_x2cm_z    vitessey periode 10 segment 200    0.02   0.065   0.0    0.02   0.065   0.20
+			       sonde_vy_ymiddle_x4cm_z    vitessey periode 10 segment 200    0.04   0.065   0.0    0.04   0.065   0.20
+			       sonde_vy_ymiddle_x8cm_z    vitessey periode 10 segment 200    0.08   0.065   0.0    0.08   0.065   0.20
+			       sonde_vy_ymiddle_x15cm_z    vitessey periode 10 segment 200   0.15   0.065   0.0    0.15   0.065   0.20
+			       sonde_vy_ymiddle_x25cm_z    vitessey periode 10 segment 200   0.25   0.065   0.0    0.25   0.065   0.20
+			       sonde_vy_ymiddle_x200cm_z vitessey periode 10 segment 200  0.200   0.065   0.0    0.200   0.065   0.20
+			       sonde_vy_ymiddle_x100cm_z    vitessey periode 10 segment 200     1.   0.065   0.0    1.   0.065   0.20
+			       sonde_vy_ymiddle_x150cm_z    vitessey periode 10 segment 200   1.5   0.065   0.0    1.5   0.065   0.20
+			       sonde_vy_ymiddle_x170cm_z    vitessey periode 10 segment 200  1.70   0.065   0.0    1.70   0.065   0.20
+			       sonde_vy_ymiddle_x180cm_z    vitessey periode 10 segment 200   1.80   0.065   0.0    1.80   0.065   0.20
+			       sonde_vy_ymiddle_x190cm_z    vitessey periode 10 segment 200  1.90   0.065   0.0    1.90   0.065   0.20
+			       sonde_vy_ymiddle_x195cm_z    vitessey periode 10 segment 200  1.95   0.065   0.0    1.95   0.065   0.20
+			       sonde_vy_ymiddle_x198cm_z    vitessey periode 10 segment 200  1.98   0.065   0.0    1.98   0.065   0.20
+			       sonde_vy_ymiddle_x199cm_z    vitessey periode 10 segment 200  1.99   0.065   0.0    1.99   0.065   0.20
+
+
+        		       sonde_vz_back_wall      vitessez periode 10 segment 2000   0.0  0.13   0.10  2.0  0.13   0.10
+        		       sonde_vz_ymiddle_zmiddle_x      vitessez periode 10 segment 2000   0.0  0.065   0.10  2.0  0.065   0.10
+        		       sonde_vz_ymiddle_zepoxy_x      vitessez periode 10 segment 2000   0.0  0.065   0.19  2.0  0.065   0.19
+        		       sonde_vz_ymiddle_zheating_x      vitessez periode 10 segment 2000   0.0  0.065   0.001  2.0  0.065   0.001
+        		       sonde_vz_ymiddle_z200mm_x      vitessez periode 10 segment 2000   0.0  0.065   0.05  2.0  0.065   0.05
+        		       sonde_vz_ymiddle_z1200mm_x      vitessez periode 10 segment 2000   0.0  0.065   0.15  2.0  0.065   0.15
+			       sonde_vz_heating_plate  vitessez periode 10 segment 2000   0.0   0.065  0.0    2.0   0.065  0.0
+			       sonde_vz_epoxy_plate    vitessez periode 10 segment 2000    0.0   0.065  0.2   2.0   0.065  0.2
+
+			       sonde_vz_ymiddle_x1mm_z    vitessez periode 10 segment 200    0.001   0.065   0.0    0.001   0.065   0.20
+			       sonde_vz_ymiddle_x1cm_z    vitessez periode 10 segment 200    0.01   0.065   0.0    0.01   0.065   0.20
+			       sonde_vz_ymiddle_x2cm_z    vitessez periode 10 segment 200    0.02   0.065   0.0    0.02   0.065   0.20
+			       sonde_vz_ymiddle_x4cm_z    vitessez periode 10 segment 200    0.04   0.065   0.0    0.04   0.065   0.20
+			       sonde_vz_ymiddle_x8cm_z    vitessez periode 10 segment 200    0.08   0.065   0.0    0.08   0.065   0.20
+			       sonde_vz_ymiddle_x15cm_z    vitessez periode 10 segment 200   0.15   0.065   0.0    0.15   0.065   0.20
+			       sonde_vz_ymiddle_x25cm_z    vitessez periode 10 segment 200   0.25   0.065   0.0    0.25   0.065   0.20
+			       sonde_vz_ymiddle_x200cm_z vitessez periode 10 segment 200  0.200   0.065   0.0    0.200   0.065   0.20
+			       sonde_vz_ymiddle_x100cm_z    vitessez periode 10 segment 200     1.   0.065   0.0    1.   0.065   0.20
+			       sonde_vz_ymiddle_x150cm_z    vitessez periode 10 segment 200   1.5   0.065   0.0    1.5   0.065   0.20
+			       sonde_vz_ymiddle_x170cm_z    vitessez periode 10 segment 200  1.70   0.065   0.0    1.70   0.065   0.20
+			       sonde_vz_ymiddle_x180cm_z    vitessez periode 10 segment 200   1.80   0.065   0.0    1.80   0.065   0.20
+			       sonde_vz_ymiddle_x190cm_z    vitessez periode 10 segment 200  1.90   0.065   0.0    1.90   0.065   0.20
+			       sonde_vz_ymiddle_x195cm_z    vitessez periode 10 segment 200  1.95   0.065   0.0    1.95   0.065   0.20
+			       sonde_vz_ymiddle_x198cm_z    vitessez periode 10 segment 200  1.98   0.065   0.0    1.98   0.065   0.20
+			       sonde_vz_ymiddle_x199cm_z    vitessez periode 10 segment 200  1.99   0.065   0.0    1.99   0.065   0.20
+
+			       
+				# ecriture des sondes des champs definis précédemment # 
+				sonde_Tmoy		  Tmoy 	        periode 1 numero_elem_sur_maitre 0
+				sonde_Tmoy_X_milieu       Tmoy_X_milieu periode 1 numero_elem_sur_maitre 0	
+				sonde_VmoyX_haut	  VmoyX_haut    periode 1 numero_elem_sur_maitre 0
+				sonde_VmoyZ_gauche	  VmoyZ_gauche  periode 1 numero_elem_sur_maitre 0 				 
+
+			} 
+			Champs dt_post 10.
+			{
+				vitesse elem
+				temperature elem
+				# vitesse faces #
+				vorticite elem			
+			}
+			# collecte des statistiques entre t_deb et t_fin #
+			/* Statistiques dt_post 5 {
+				t_deb 600 t_fin 3000
+				moyenne vitesse elem
+				ecart_type vitesse elem
+				moyenne temperature elem
+				ecart_type temperature elem
+			} */
+		} 
+		
+		# post-traitement du domaine gauche #
+		post_gauche {
+			domaine gauche
+			Format lata
+			fichier Post_Rayleigh_gauche
+			Definition_champs 
+			{
+				# Récupération de la température aux éléments de la surface gauche #	 			
+				tempelem_gauche Extraction { 
+					domaine gauche nom_frontiere gauche methode trace source Refchamp { 
+						Pb_champ pb temperature } }
+		
+				# Récupération de la température à l'interface de la surface gauche # 
+				tempfront_gauche Extraction { 
+					domaine gauche nom_frontiere gauche methode champ_frontiere source Refchamp { 
+						Pb_champ pb temperature } }
+
+				# tempelem - tempfront # 
+				delta_T_gauche Transformation {
+					methode formule expression 1 (tempelem_gauche-tempfront_gauche) sources_reference { tempelem_gauche , tempfront_gauche } }
+
+				# moyenne de delta_T à la surface gauche # 
+				delta_T_gauche_moy Reduction_0D  { methode moyenne source_reference delta_T_gauche }
+			
+			} 
+			Probes 
+			{
+				sonde_delta_T_gauche_moy delta_T_gauche_moy periode 1. numero_elem_sur_maitre 0	
+				sonde_tempelem_gauche tempelem_gauche periode 1. numero_elem_sur_maitre 0	
+				sonde_tempfront_gauche tempfront_gauche periode 1. numero_elem_sur_maitre 0	
+			}
+			Champs dt_post 5.
+			{
+				delta_T_gauche
+				vitesse elem
+				temperature elem
+			}
+			/* Statistiques dt_post 5 {
+				t_deb 600 t_fin 3000
+				moyenne vitesse elem
+				ecart_type vitesse elem
+				moyenne temperature elem
+				ecart_type temperature elem
+			} */
+		} 
+
+		# post-traitement du domaine haut #
+		post_haut {
+			domaine haut
+			Format lata
+			fichier Post_Rayleigh_haut
+			Definition_champs 
+			{
+				# Récupération de la température aux éléments de la surface haut #	 			
+				 tempelem_haut Extraction { 
+					domaine haut nom_frontiere haut methode trace source Refchamp { 
+						Pb_champ pb temperature } }
+		
+					# moyenne de la température à la surface haut # 
+			
+			      T_haut_moy Reduction_0D { methode moyenne source_reference tempelem_haut }
+
+				# Flux a la surface haut - tocheck # 
+                   fluxelem_haut Transformation { 
+		        	methode formule expression 1 0.2*5.68e-8*(tempelem_haut^4-400.^4) source_reference tempelem_haut }
+
+				# Récupération de la température à l'interface de la surface haut # 
+				 tempfront_haut Extraction { 
+					domaine haut nom_frontiere haut methode champ_frontiere source Refchamp { 
+						Pb_champ pb temperature } }
+
+				# tempelem - tempfront # 
+				 delta_T_haut Transformation {
+					methode formule expression 1 (tempelem_haut-tempfront_haut) sources_reference { tempelem_haut , tempfront_haut } }
+
+				# moyenne de delta_T à la surface haut # 
+				 delta_T_haut_moy Reduction_0D  { methode moyenne source_reference delta_T_haut }
+			}
+			Probes 
+			{
+				sonde_delta_T_haut_moy delta_T_haut_moy periode 1. numero_elem_sur_maitre 0	
+				sonde_T_haut_moy T_haut_moy periode 1. numero_elem_sur_maitre 0	
+				sonde_tempfront_haut tempfront_haut periode 1. numero_elem_sur_maitre 0
+				sonde_fluxelem_haut fluxelem_haut periode 1. numero_elem_sur_maitre 0
+			}
+			Champs dt_post 10. 
+			{
+				tempelem_haut
+				vitesse elem
+				temperature elem			
+			}    
+			/* Statistiques dt_post 5 {
+				t_deb 600 t_fin 3000
+				moyenne vitesse elem
+				ecart_type vitesse elem
+				moyenne temperature elem
+				ecart_type temperature elem
+			} */
+		}  
+		
+        # post-traitement du domaine bas #
+		post_bas {
+			domaine bas
+			Format lata
+			fichier Post_Rayleigh_bas
+			Definition_champs 
+			{
+				# Récupération de la température aux éléments de la surface bas #	 			
+				 tempelem_bas Extraction { 
+					domaine bas nom_frontiere bas methode trace source Refchamp { 
+						Pb_champ pb temperature } }
+		
+					# moyenne de la température à la surface bas # 
+			
+			      T_bas_moy Reduction_0D { methode moyenne source_reference tempelem_bas }
+
+               
+				# Récupération de la température à l'interface de la surface bas # 
+				 tempfront_bas Extraction { 
+					domaine bas nom_frontiere bas methode champ_frontiere source Refchamp { 
+						Pb_champ pb temperature } }
+
+				# tempelem - tempfront # 
+				 delta_T_bas Transformation {
+					methode formule expression 1 (tempelem_bas-tempfront_bas) sources_reference { tempelem_bas , tempfront_bas } }
+
+				# moyenne de delta_T à la surface bas # 
+				 delta_T_bas_moy Reduction_0D  { methode moyenne source_reference delta_T_bas }
+			}
+			Probes 
+			{
+				sonde_delta_T_bas_moy delta_T_bas_moy periode 1. numero_elem_sur_maitre 0	
+				sonde_T_bas_moy T_bas_moy periode 1. numero_elem_sur_maitre 0	
+				sonde_tempfront_bas tempfront_bas periode 1. numero_elem_sur_maitre 0
+			}
+			Champs dt_post 10. 
+			{
+				tempelem_bas
+				vitesse elem
+				temperature elem			
+			}    
+			/* Statistiques dt_post 5 {
+				t_deb 600 t_fin 3000
+				moyenne vitesse elem
+				ecart_type vitesse elem
+				moyenne temperature elem
+				ecart_type temperature elem
+			} */
+		}  
+	
+	}
+     #   Resume_last_time binaire ../run0/Rayleigh_0.2_400K_pb.sauv #
+    # Reprise binaire  4U_3D_2000x130x100_para_pb.sauv #
+} 
+
+# Imprimer_flux dom { haut bas droite gauche devant derriere } #
+# Imprimer_flux_sum dom { haut bas droite gauche devant derriere  } #
+
+EcritureLectureSpecial 0
+Solve pb # to comment with ICoCo #
+End
+
+
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS.lml.gz b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS.lml.gz
new file mode 100644
index 0000000000..7f72ab509d
Binary files /dev/null and b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS.lml.gz differ
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.adastra_gfx90a b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.adastra_gfx90a
new file mode 100644
index 0000000000..259c9dc554
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.adastra_gfx90a
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     08-06-2026 -- 16:43:07
+OS:       g1300__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                25.6207        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.912859       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               3.81293        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.423659       
+Standard deviation between time steps:                                     0.976933       
+Time elapsed in the skipped time steps:                                    0.226085       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.022411        |  5.3        | 1              
+Convection operator                      | 0.00202218      |  0.5        | 2              
+Diffusion operator                       | 0.0240692       |  5.7        | 9              
+Gradient operator                        | 0.008624134     |  2.0        | 2              
+Divergence operator                      | 0.0004805299    |  0.1        | 2              
+Source terms                             | 0.0003243424    |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.0006458732    |  0.2        | 1              
+Solver for implicit diffusion            | 0.003458095     |  0.8        | 2              
+Computation of the time step dt          | 0.001143179     |  0.3        | 6              
+Post-treatment operations                | 0.3663823       | 86.5        | 1              
+Other operations                         | -0.005902315    | -1.4        | 
+
+Average number of iteration of the linear solver per call:                 20             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0222595       |  5.3        | 1               | 
+Kernels:                                 | 0.03434         |  8.1        | 451             | 
+Copy host to device:                     | 0.00270684      |  0.6        | 38              | 16.8 GB/s
+Copy device to host:                     | 0.00466711      |  1.1        | 31              | 18.8 GB/s
+Alloc/Free on device:                    | 9.62469e-05     |  0.0        | 386             | 
+GPU: 13% Copy H<->D: 1.7% Alloc/free: 0.023% Comm: 0% CPU & I/O: 85%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0747186      
+
+Total time for the whole computation                                       29.7344        
+
+[Slurm] Power consumption (39 s):  0.426 kW  0.005 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.adastra_gfx942 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.adastra_gfx942
new file mode 100644
index 0000000000..2195d121df
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.adastra_gfx942
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     08-06-2026 -- 14:56:53
+OS:       a1002__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+CPU model : AMD Instinct MI300A Accelerator
+Total number of threads:192
+GPU model: AMD Instinct MI300A
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                18.456         
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.749276       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.8676         
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.318622       
+Standard deviation between time steps:                                     0.744298       
+Time elapsed in the skipped time steps:                                    0.180249       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0146931       |  4.6        | 1              
+Convection operator                      | 0.001107729     |  0.3        | 2              
+Diffusion operator                       | 0.01717939      |  5.4        | 9              
+Gradient operator                        | 0.009569378     |  3.0        | 2              
+Divergence operator                      | 0.0002454842    |  0.1        | 2              
+Source terms                             | 0.0002103628    |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.0004310533    |  0.1        | 1              
+Solver for implicit diffusion            | 0.002886217     |  0.9        | 2              
+Computation of the time step dt          | 0.0006566232    |  0.2        | 6              
+Post-treatment operations                | 0.2791213       | 87.6        | 1              
+Other operations                         | -0.007478771    | -2.3        | 
+
+Average number of iteration of the linear solver per call:                 20             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0145741       |  4.6        | 1               | 
+Kernels:                                 | 0.0155426       |  4.9        | 451             | 
+Copy host to device:                     | 0.001705        |  0.5        | 38              | 26.6 GB/s
+Copy device to host:                     | 0.00263336      |  0.8        | 31              | 33.3 GB/s
+Alloc/Free on device:                    | 0.000196487     |  0.1        | 386             | 
+GPU: 9.5% Copy H<->D: 1.4% Alloc/free: 0.062% Comm: 0% CPU & I/O: 89%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.073254       
+
+Total time for the whole computation                                       21.5772        
+
+[Slurm] Power consumption (29 s):  0.606 kW  0.005 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.dalianvl_cc100 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..5d0c4c1dc9
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:19:59
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                10.3147        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.47066        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.46847        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.274274       
+Standard deviation between time steps:                                     0.653925       
+Time elapsed in the skipped time steps:                                    0.53959        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0130017       |  4.7        | 1              
+Convection operator                      | 0.0007011693    |  0.3        | 2              
+Diffusion operator                       | 0.01002715      |  3.7        | 9              
+Gradient operator                        | 0.004981093     |  1.8        | 2              
+Divergence operator                      | 0.0002038184    |  0.1        | 2              
+Source terms                             | 0.0001672706    |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.000417205     |  0.2        | 1              
+Solver for implicit diffusion            | 0.001707723     |  0.6        | 2              
+Computation of the time step dt          | 0.0004988829    |  0.2        | 6              
+Post-treatment operations                | 0.2453269       | 89.4        | 1              
+Other operations                         | -0.002758624    | -1.0        | 
+
+Average number of iteration of the linear solver per call:                 20             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0127201       |  4.6        | 1               | 
+Kernels:                                 | 0.00998931      |  3.6        | 451             | 
+Copy host to device:                     | 0.00101425      |  0.4        | 38              | 44.8 GB/s
+Copy device to host:                     | 0.0014345       |  0.5        | 31              | 61.2 GB/s
+Alloc/Free on device:                    | 0.000276302     |  0.1        | 386             | 
+GPU: 8.3% Copy H<->D: 0.89% Alloc/free: 0.1% Comm: 0% CPU & I/O: 91%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0525768      
+
+Total time for the whole computation                                       13.3754        
+
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.eureka_cc89 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..1235b5aa36
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.eureka_cc89
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:38:31
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                12.0825        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.808736       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.32686        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.25854        
+Standard deviation between time steps:                                     0.526476       
+Time elapsed in the skipped time steps:                                    0.237277       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0296802       | 11.5        | 1              
+Convection operator                      | 0.002672108     |  1.0        | 2              
+Diffusion operator                       | 0.02081252      |  8.1        | 9              
+Gradient operator                        | 0.006097233     |  2.4        | 2              
+Divergence operator                      | 0.003118208     |  1.2        | 2              
+Source terms                             | 0.0002127497    |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.001786451     |  0.7        | 1              
+Solver for implicit diffusion            | 0.001692247     |  0.7        | 2              
+Computation of the time step dt          | 0.0007426396    |  0.3        | 6              
+Post-treatment operations                | 0.197774        | 76.5        | 1              
+Other operations                         | -0.006048129    | -2.3        | 
+
+Average number of iteration of the linear solver per call:                 29.3           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0295874       | 11.4        | 1               | 
+Kernels:                                 | 0.01683         |  6.5        | 430             | 
+Copy host to device:                     | 0.00693595      |  2.7        | 29              | 9.9 GB/s
+Copy device to host:                     | 0.00794341      |  3.1        | 23              | 14.1 GB/s
+Alloc/Free on device:                    | 9.93497e-05     |  0.0        | 385             | 
+GPU: 18% Copy H<->D: 5.8% Alloc/free: 0.038% Comm: 0% CPU & I/O: 76%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0105536      
+
+Total time for the whole computation                                       14.6572        
+
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.irene-amd-ccrt_cc70
new file mode 100644
index 0000000000..ce8b04d16c
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.irene-amd-ccrt_cc70
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     23-04-2026 -- 09:48:39
+OS:       irene7050__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
+Total number of threads:80
+GPU model: Tesla V100-SXM2-16GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                15.0231        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.87689        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               5.3121         
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.590233       
+Standard deviation between time steps:                                     1.32085        
+Time elapsed in the skipped time steps:                                    0.448604       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0376823       |  6.4        | 1              
+Convection operator                      | 0.001876149     |  0.3        | 2              
+Diffusion operator                       | 0.04526071      |  7.7        | 9              
+Gradient operator                        | 0.007839558     |  1.3        | 2              
+Divergence operator                      | 0.0006154726    |  0.1        | 2              
+Source terms                             | 0.00030727      |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.0006910987    |  0.1        | 1              
+Solver for implicit diffusion            | 0.003822609     |  0.6        | 2              
+Computation of the time step dt          | 0.001210896     |  0.2        | 6              
+Post-treatment operations                | 0.4956301       | 84.0        | 1              
+Other operations                         | -0.004703008    | -0.8        | 
+
+Average number of iteration of the linear solver per call:                 24.9           
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0375116       |  6.4        | 1               | 
+Kernels:                                 | 0.0249549       |  4.2        | 432             | 
+Copy host to device:                     | 0.0121441       |  2.1        | 27              | 4.2 GB/s
+Copy device to host:                     | 0.025129        |  4.3        | 23              | 4.6 GB/s
+Alloc/Free on device:                    | 0.000107331     |  0.0        | 384             | 
+GPU: 11% Copy H<->D: 6.3% Alloc/free: 0.018% Comm: 0% CPU & I/O: 83%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.074717       
+
+Total time for the whole computation                                       20.8585        
+
+[Slurm] Power consumption (36 s):  0.246 kW  0.002 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is157091_cc86 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is157091_cc86
new file mode 100644
index 0000000000..e68443d67e
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is157091_cc86
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     14-05-2026 -- 16:27:50
+OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
+CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
+Total number of threads:64
+GPU model: NVIDIA RTX A6000
+CUDA runtime version: 12.90
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                10.9568        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.567296       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               3.25948        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.362164       
+Standard deviation between time steps:                                     0.767551       
+Time elapsed in the skipped time steps:                                    0.197513       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0344571       |  9.5        | 1              
+Convection operator                      | 0.002472719     |  0.7        | 2              
+Diffusion operator                       | 0.02741157      |  7.6        | 9              
+Gradient operator                        | 0.007760346     |  2.1        | 2              
+Divergence operator                      | 0.0006208222    |  0.2        | 2              
+Source terms                             | 0.0003084709    |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.0006127068    |  0.2        | 1              
+Solver for implicit diffusion            | 0.003998419     |  1.1        | 2              
+Computation of the time step dt          | 0.001362192     |  0.4        | 6              
+Post-treatment operations                | 0.2878818       | 79.5        | 1              
+Other operations                         | -0.004721987    | -1.3        | 
+
+Average number of iteration of the linear solver per call:                 20             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0342252       |  9.5        | 1               | 
+Kernels:                                 | 0.0249121       |  6.9        | 451             | 
+Copy host to device:                     | 0.00424511      |  1.2        | 38              | 10.7 GB/s
+Copy device to host:                     | 0.00826155      |  2.3        | 31              | 10.6 GB/s
+Alloc/Free on device:                    | 0.000147109     |  0.0        | 386             | 
+GPU: 16% Copy H<->D: 3.5% Alloc/free: 0.041% Comm: 0% CPU & I/O: 80%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0714031      
+
+Total time for the whole computation                                       14.4852        
+
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is159479_cc120 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..9cde3f4150
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is159479_cc120
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     13-05-2026 -- 07:07:50
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.29-default__#1 SMP PREEMPT_DYNAMIC Fri May  1 12:45:19 UTC 2026 (6
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                6.99314        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.359449       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.67615        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.186239       
+Standard deviation between time steps:                                     0.390992       
+Time elapsed in the skipped time steps:                                    0.162419       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0150849       |  8.1        | 1              
+Convection operator                      | 0.0008391611    |  0.5        | 2              
+Diffusion operator                       | 0.02027865      | 10.9        | 9              
+Gradient operator                        | 0.006390957     |  3.4        | 2              
+Divergence operator                      | 0.000193898     |  0.1        | 2              
+Source terms                             | 0.0001278147    |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.0002891871    |  0.2        | 1              
+Solver for implicit diffusion            | 0.001259632     |  0.7        | 2              
+Computation of the time step dt          | 0.0004311387    |  0.2        | 6              
+Post-treatment operations                | 0.1466799       | 78.8        | 1              
+Other operations                         | -0.005336367    | -2.9        | 
+
+Average number of iteration of the linear solver per call:                 20             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0149707       |  8.0        | 1               | 
+Kernels:                                 | 0.00987719      |  5.3        | 451             | 
+Copy host to device:                     | 0.00382741      |  2.1        | 38              | 11.9 GB/s
+Copy device to host:                     | 0.00974978      |  5.2        | 31              | 9.0 GB/s
+Alloc/Free on device:                    | 0.000131351     |  0.1        | 386             | 
+GPU: 13% Copy H<->D: 7.3% Alloc/free: 0.071% Comm: 0% CPU & I/O: 79%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0120666      
+
+Total time for the whole computation                                       8.84379        
+
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is246827_cc86 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is246827_cc86
new file mode 100644
index 0000000000..93bebc81c5
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is246827_cc86
@@ -0,0 +1,52 @@
+Statistiques d'initialisation du calcul
+
+Temps total                       10.2829
+
+Statistiques de resolution du probleme
+
+Temps total                       6.3659
+
+
+Timesteps                         10
+Secondes / pas de temps           0.636587
+Dont solveurs Ax=B                0.158250 24% (1 appel/pas de temps)
+Dont solveur diffusion_implicite  0.042880  6% (2 appels/pas de temps)
+Dont mettre_a_jour                0.083516 13% (1 appel/pas de temps)
+Dont operateurs convection        0.016030  2% (2 appels/pas de temps)
+Dont operateurs diffusion         0.062878  9% (8.9 appels/pas de temps)
+Dont operateurs gradient          0.010380  1% (2.2 appels/pas de temps)
+Dont operateurs divergence        0.004090  0% (2 appels/pas de temps)
+Dont operateurs source            0.001426  0% (1 appel/pas de temps)
+Dont operations postraitement     0.241783 37% (1 appel/pas de temps)
+Dont calcul dt                    0.003010  0% (6 appels/pas de temps)
+Dont calcul divers                0.012344  1% (0 appels/pas de temps)
+Nb solveur / pas de temps         1
+Secondes / solveur                0.15825
+Iterations / solveur              18
+GPU statistics per time step (experimental):
+Libraries : 0.157395 s 24.7%  1.0 calls
+Kernels   : 0.105369 s 16.6% 406.9 calls
+Copy H2D  : 0.023752 s  3.7% 51.2 calls  9.2 GB/s
+Copy D2H  : 0.021397 s  3.4% 39.6 calls 10.0 GB/s
+Alloc/Free: 0.005759 s  0.9% 403.0 calls
+GPU: 41.2% Copy H<->D: 7% Alloc/Free: 0.9% Comm: 0% CPU & Others: 50.7%
+I/O:
+
+Timesteps = number of time steps
+Nb solveur = number of linear system resolutions
+Nb assemblage implicite = number of matrix assemblies for the implicit scheme
+Iterations = average number of iterations of the solver
+Communications = fraction of the time spent
+                 in communications between processors (excluding io files)
+Network latency = time of one mpsum measured by an internal bench over 0.1s
+Network bandwidth = maximum on all processors
+                    of the average bandwidth of send_recv operations
+Waiting time = estimation of the waiting time of the different processors
+
+Max_waiting_time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+
+Statistiques de post resolution
+
+Temps total                       0.014188
+
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is247793_gfx1100 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is247793_gfx1100
new file mode 100644
index 0000000000..0a9068ceab
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.is247793_gfx1100
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 19:07:04
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                9.83297        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.987836       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               2.41001        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.267778       
+Standard deviation between time steps:                                     0.514588       
+Time elapsed in the skipped time steps:                                    0.258782       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0385379       | 14.4        | 1              
+Convection operator                      | 0.00245368      |  0.9        | 2              
+Diffusion operator                       | 0.02310321      |  8.6        | 9              
+Gradient operator                        | 0.006615284     |  2.5        | 2              
+Divergence operator                      | 0.0006505396    |  0.2        | 2              
+Source terms                             | 0.0003957816    |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.0007646496    |  0.3        | 1              
+Solver for implicit diffusion            | 0.004011812     |  1.5        | 2              
+Computation of the time step dt          | 0.001353264     |  0.5        | 6              
+Post-treatment operations                | 0.193474        | 72.3        | 1              
+Other operations                         | -0.003581728    | -1.3        | 
+
+Average number of iteration of the linear solver per call:                 20             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0383825       | 14.3        | 1               | 
+Kernels:                                 | 0.0315465       | 11.8        | 451             | 
+Copy host to device:                     | 0.00322612      |  1.2        | 38              | 14.1 GB/s
+Copy device to host:                     | 0.0041323       |  1.5        | 31              | 21.2 GB/s
+Alloc/Free on device:                    | 0.00016324      |  0.1        | 386             | 
+GPU: 26% Copy H<->D: 2.7% Alloc/free: 0.061% Comm: 0% CPU & I/O: 71%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0369687      
+
+Total time for the whole computation                                       12.5387        
+
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.jean-zay_cc90 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.jean-zay_cc90
new file mode 100644
index 0000000000..555ca2462e
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.jean-zay_cc90
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     10-06-2026 -- 10:48:07
+OS:       jzxh021__Linux__x86_64__5.14.0-570.116.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue May 19 08:44:54 EDT 2026
+CPU model : Intel(R) Xeon(R) Platinum 8468
+Total number of threads:192
+GPU model: NVIDIA H100 80GB HBM3
+CUDA runtime version: 12.60
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                16.2378        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.573866       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               3.01618        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.335131       
+Standard deviation between time steps:                                     0.769179       
+Time elapsed in the skipped time steps:                                    0.269408       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0156995       |  4.7        | 1              
+Convection operator                      | 0.0008767052    |  0.3        | 2              
+Diffusion operator                       | 0.02406568      |  7.2        | 9              
+Gradient operator                        | 0.009106745     |  2.7        | 2              
+Divergence operator                      | 0.0002421182    |  0.1        | 2              
+Source terms                             | 0.000181716     |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.0004172798    |  0.1        | 1              
+Solver for implicit diffusion            | 0.00191558      |  0.6        | 2              
+Computation of the time step dt          | 0.0005445146    |  0.2        | 6              
+Post-treatment operations                | 0.2885673       | 86.1        | 1              
+Other operations                         | -0.006486277    | -1.9        | 
+
+Average number of iteration of the linear solver per call:                 20             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0154814       |  4.6        | 1               | 
+Kernels:                                 | 0.0136429       |  4.1        | 451             | 
+Copy host to device:                     | 0.00519298      |  1.5        | 38              | 8.7 GB/s
+Copy device to host:                     | 0.0112822       |  3.4        | 31              | 7.8 GB/s
+Alloc/Free on device:                    | 0.000159493     |  0.0        | 386             | 
+GPU: 8.7% Copy H<->D: 4.9% Alloc/free: 0.048% Comm: 0% CPU & I/O: 86%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0593804      
+
+Total time for the whole computation                                       19.5828        
+
+[Slurm] Power consumption (28 s):  0.432 kW  0.003 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.lumi_gfx90a b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.lumi_gfx90a
new file mode 100644
index 0000000000..28ea817dc8
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.lumi_gfx90a
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     15-05-2026 -- 21:14:43
+OS:       nid005002__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+CPU model : AMD EPYC 7A53 64-Core Processor
+Total number of threads:128
+GPU model: AMD Instinct MI250X
+HIP runtime version: 6.43
+HIP drivers version: 6.43
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                57.6288        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.5162         
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               3.5764         
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.397378       
+Standard deviation between time steps:                                     0.910304       
+Time elapsed in the skipped time steps:                                    0.2462         
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0196565       |  4.9        | 1              
+Convection operator                      | 0.002379149     |  0.6        | 2              
+Diffusion operator                       | 0.025488        |  6.4        | 9              
+Gradient operator                        | 0.007580062     |  1.9        | 2              
+Divergence operator                      | 0.0005331556    |  0.1        | 2              
+Source terms                             | 0.0003113889    |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.0006967458    |  0.2        | 1              
+Solver for implicit diffusion            | 0.003233921     |  0.8        | 2              
+Computation of the time step dt          | 0.001146262     |  0.3        | 6              
+Post-treatment operations                | 0.3413949       | 85.9        | 1              
+Other operations                         | -0.005041789    | -1.3        | 
+
+Average number of iteration of the linear solver per call:                 20             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0195123       |  4.9        | 1               | 
+Kernels:                                 | 0.0348402       |  8.8        | 451             | 
+Copy host to device:                     | 0.00272454      |  0.7        | 38              | 16.7 GB/s
+Copy device to host:                     | 0.00472031      |  1.2        | 31              | 18.6 GB/s
+Alloc/Free on device:                    | 9.4555e-05      |  0.0        | 386             | 
+GPU: 14% Copy H<->D: 1.9% Alloc/free: 0.024% Comm: 0% CPU & I/O: 84%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0660653      
+
+Total time for the whole computation                                       61.5175        
+
+[Slurm] Power consumption (82 s):  0.461 kW  0.010 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.topaze_cc80 b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.topaze_cc80
new file mode 100644
index 0000000000..98731ede4d
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VDF_DNS/thermohydraulique_VDF_DNS_BENCH.TU.topaze_cc80
@@ -0,0 +1,78 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VDF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     15-05-2026 -- 14:01:08
+OS:       topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+CPU model : AMD EPYC 7763 64-Core Processor
+Total number of threads:256
+GPU model: NVIDIA A100-SXM4-80GB
+CUDA runtime version: 12.90
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1000000
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                15.3546        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.753724       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               3.64246        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.404717       
+Standard deviation between time steps:                                     0.950988       
+Time elapsed in the skipped time steps:                                    0.357684       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.019992        |  4.9        | 1              
+Convection operator                      | 0.001205197     |  0.3        | 2              
+Diffusion operator                       | 0.02073481      |  5.1        | 9              
+Gradient operator                        | 0.007618354     |  1.9        | 2              
+Divergence operator                      | 0.0003536566    |  0.1        | 2              
+Source terms                             | 0.0002187154    |  0.1        | 1              
+Update ::mettre_a_jour                   | 0.0004763972    |  0.1        | 1              
+Solver for implicit diffusion            | 0.002284928     |  0.6        | 2              
+Computation of the time step dt          | 0.0007091229    |  0.2        | 6              
+Post-treatment operations                | 0.3566838       | 88.1        | 1              
+Other operations                         | -0.005559456    | -1.4        | 
+
+Average number of iteration of the linear solver per call:                 20             
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.019737        |  4.9        | 1               | 
+Kernels:                                 | 0.0149773       |  3.7        | 451             | 
+Copy host to device:                     | 0.00328589      |  0.8        | 38              | 13.8 GB/s
+Copy device to host:                     | 0.0061287       |  1.5        | 31              | 14.3 GB/s
+Alloc/Free on device:                    | 0.000240941     |  0.1        | 386             | 
+GPU: 8.6% Copy H<->D: 2.3% Alloc/free: 0.06% Comm: 0% CPU & I/O: 89%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.0820633      
+
+Total time for the whole computation                                       19.4368        
+
+[Slurm] Power consumption (62 s):  0.435 kW  0.007 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS.data b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS.data
index 5143a22e67..901bfc3f8b 100644
--- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS.data
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS.data
@@ -41,7 +41,8 @@ END PARTITION #
 Scatter DOM.Zones dom
 END SCATTER #
 
-VEFPreP1B dis
+VEFPreP1B dis 
+Lire dis { reorder { algo Hilbert } }
 
 runge_kutta_ordre_3 sch_ex
 Read sch_ex
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx90a b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx90a
index abc6fb98e2..9a6ba42432 100644
--- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx90a
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx90a
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 18:18:45
-OS:       g1081__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 09:39:29
+OS:       g1182__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                36.3455        
+Total time of the start-up:                                                38.5416        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.817213       
+Average time of the resolution of the linear problem per call:             1.01807        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               8.34709        
+Total time of the time loop:                                               7.03316        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.927454       
-Standard deviation between time steps:                                     0.186576       
-Time elapsed in the skipped time steps:                                    1.10284        
+Average time per time step:                                                0.781463       
+Standard deviation between time steps:                                     0.174335       
+Time elapsed in the skipped time steps:                                    0.919797       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0353757       |  3.4        | 3              
-Convection operator                      | 0.2771735       | 26.4        | 6              
-Diffusion operator                       | 0.1017226       |  9.7        | 24             
-Gradient operator                        | 0.08935723      |  8.5        | 6              
-Divergence operator                      | 0.01984002      |  1.9        | 4              
-Source terms                             | 0.1596252       | 15.2        | 6              
-Update ::mettre_a_jour                   | 0.02635399      |  2.5        | 1              
-Solver for implicit diffusion            | 0.02084072      |  2.0        | 6              
-Computation of the time step dt          | 0.003133064     |  0.3        | 10             
-Post-treatment operations                | 0.1788569       | 17.0        | 1              
-Other operations                         | 0.01517503      |  1.4        | 
+Linear solver resolutions Ax=B           | 0.0294224       |  3.8        | 3              
+Convection operator                      | 0.2453152       | 31.4        | 6              
+Diffusion operator                       | 0.08785235      | 11.2        | 24             
+Gradient operator                        | 0.04586587      |  5.9        | 6              
+Divergence operator                      | 0.01216054      |  1.6        | 4              
+Source terms                             | 0.1386433       | 17.7        | 6              
+Update ::mettre_a_jour                   | 0.02083962      |  2.7        | 1              
+Solver for implicit diffusion            | 0.01898591      |  2.4        | 6              
+Computation of the time step dt          | 0.00293547      |  0.4        | 10             
+Post-treatment operations                | 0.1645779       | 21.1        | 1              
+Other operations                         | 0.01486426      |  1.9        | 
 
 Average number of iteration of the linear solver per call:                 8              
 
@@ -62,17 +62,17 @@ Average number of iteration of the linear solver per call:                 8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0348663       |  3.8        | 3               | 
-Kernels:                                 | 0.828195        | 89.3        | 1807            | 
-Copy host to device:                     | 0.000956014     |  0.1        | 43              | 3.1 GB/s
-Copy device to host:                     | 0.00563713      |  0.6        | 98              | 11.0 GB/s
-Alloc/Free on device:                    | 0.000106537     |  0.0        | 4               | 
-GPU: 93% Copy H<->D: 0.71% Alloc/free: 0.011% Comm: 0% CPU & I/O: 6.2%
+Libraries:                               | 0.0288868       |  3.7        | 3               | 
+Kernels:                                 | 0.688197        | 88.1        | 1804            | 
+Copy host to device:                     | 0.000964265     |  0.1        | 43              | 3.0 GB/s
+Copy device to host:                     | 0.0056269       |  0.7        | 98              | 11.0 GB/s
+Alloc/Free on device:                    | 0.000106762     |  0.0        | 4               | 
+GPU: 92% Copy H<->D: 0.84% Alloc/free: 0.014% Comm: 0% CPU & I/O: 7.4%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               1.18684        
+Time of the post-resolution:                                               1.18641        
 
-Total time for the whole computation                                       46.9823        
+Total time for the whole computation                                       47.681         
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (55 s):  0.473 kW  0.007 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx942 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx942
index d45c73d182..8c6d846b16 100644
--- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx942
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.adastra_gfx942
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     17-03-2026 -- 18:10:43
-OS:       a1007__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
+Date:     23-04-2026 -- 15:01:05
+OS:       a1003__Linux__x86_64__5.14.0-570.69.1.el9_6.x86_64__#1 SMP PREEMPT_DYNAMIC Tue Nov 25 01:30:14 EST 2025
 CPU model : AMD Instinct MI300A Accelerator
 Total number of threads:192
 GPU model: AMD Instinct MI300A
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                55.4268        
+Total time of the start-up:                                                45.1512        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.14745        
+Average time of the resolution of the linear problem per call:             1.25045        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               2.77306        
+Total time of the time loop:                                               2.72452        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.308118       
-Standard deviation between time steps:                                     0.141243       
-Time elapsed in the skipped time steps:                                    0.601196       
+Average time per time step:                                                0.302725       
+Standard deviation between time steps:                                     0.144304       
+Time elapsed in the skipped time steps:                                    0.608773       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0180691       |  5.9        | 3              
-Convection operator                      | 0.05506105      | 17.9        | 6              
-Diffusion operator                       | 0.02738197      |  8.9        | 24             
-Gradient operator                        | 0.01970771      |  6.4        | 6              
-Divergence operator                      | 0.005881149     |  1.9        | 4              
-Source terms                             | 0.0142448       |  4.6        | 6              
-Update ::mettre_a_jour                   | 0.01042488      |  3.4        | 1              
-Solver for implicit diffusion            | 0.01383581      |  4.5        | 6              
-Computation of the time step dt          | 0.002407566     |  0.8        | 10             
-Post-treatment operations                | 0.1322655       | 42.9        | 1              
-Other operations                         | 0.008838375     |  2.9        | 
+Linear solver resolutions Ax=B           | 0.016609        |  5.5        | 3              
+Convection operator                      | 0.05170634      | 17.1        | 6              
+Diffusion operator                       | 0.027014        |  8.9        | 24             
+Gradient operator                        | 0.0191032       |  6.3        | 6              
+Divergence operator                      | 0.004540961     |  1.5        | 4              
+Source terms                             | 0.01447686      |  4.8        | 6              
+Update ::mettre_a_jour                   | 0.009973257     |  3.3        | 1              
+Solver for implicit diffusion            | 0.01398563      |  4.6        | 6              
+Computation of the time step dt          | 0.002454303     |  0.8        | 10             
+Post-treatment operations                | 0.1340098       | 44.3        | 1              
+Other operations                         | 0.008851631     |  2.9        | 
 
 Average number of iteration of the linear solver per call:                 8              
 
@@ -62,17 +62,17 @@ Average number of iteration of the linear solver per call:                 8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0175936       |  5.7        | 3               | 
-Kernels:                                 | 0.23352         | 75.8        | 1807            | 
-Copy host to device:                     | 0.000902083     |  0.3        | 43              | 3.2 GB/s
-Copy device to host:                     | 0.00451282      |  1.5        | 98              | 13.7 GB/s
-Alloc/Free on device:                    | 4.65706e-05     |  0.0        | 4               | 
-GPU: 81% Copy H<->D: 1.8% Alloc/free: 0.015% Comm: 0% CPU & I/O: 17%
+Libraries:                               | 0.0162141       |  5.4        | 3               | 
+Kernels:                                 | 0.227653        | 75.2        | 1804            | 
+Copy host to device:                     | 0.000941173     |  0.3        | 43              | 3.1 GB/s
+Copy device to host:                     | 0.00467396      |  1.5        | 98              | 13.3 GB/s
+Alloc/Free on device:                    | 4.81214e-05     |  0.0        | 4               | 
+GPU: 81% Copy H<->D: 1.9% Alloc/free: 0.016% Comm: 0% CPU & I/O: 18%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               1.11388        
+Time of the post-resolution:                                               1.02665        
 
-Total time for the whole computation                                       59.915         
+Total time for the whole computation                                       49.5112        
 
-[Slurm] Power consumption (69 s):  0.680 kW  0.013 kWh  0.001 € (0.10€/kWh)
+[Slurm] Power consumption (59 s):  0.654 kW  0.011 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.dalianvl_cc100 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.dalianvl_cc100
new file mode 100644
index 0000000000..15616e173d
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.dalianvl_cc100
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VEF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 12:20:32
+OS:       dalianvl14__Linux__aarch64__5.14.0-503.40.1.el9_5.aarch64__#1 SMP PREEMPT_DYNAMIC Thu Apr 24 13:05:29 UTC 2025
+CPU model : Unknown Linux CPU
+Total number of threads:144
+GPU model: NVIDIA GB200
+CUDA runtime version: 13.0
+CUDA drivers version: 13.20
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1638400
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                24.612         
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.896434       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.50874        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.167637       
+Standard deviation between time steps:                                     0.112427       
+Time elapsed in the skipped time steps:                                    0.291127       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.016792        | 10.0        | 3              
+Convection operator                      | 0.01345494      |  8.0        | 6              
+Diffusion operator                       | 0.0119605       |  7.1        | 24             
+Gradient operator                        | 0.005281847     |  3.2        | 6              
+Divergence operator                      | 0.00148673      |  0.9        | 4              
+Source terms                             | 0.0224019       | 13.4        | 6              
+Update ::mettre_a_jour                   | 0.005473263     |  3.3        | 1              
+Solver for implicit diffusion            | 0.007801915     |  4.7        | 6              
+Computation of the time step dt          | 0.001045642     |  0.6        | 10             
+Post-treatment operations                | 0.07443955      | 44.4        | 1              
+Other operations                         | 0.007499072     |  4.5        | 
+
+Average number of iteration of the linear solver per call:                 8              
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0164142       |  9.8        | 3               | 
+Kernels:                                 | 0.0958071       | 57.2        | 1632            | 
+Copy host to device:                     | 0.000911807     |  0.5        | 51              | 3.2 GB/s
+Copy device to host:                     | 0.00192771      |  1.1        | 40              | 32.1 GB/s
+Alloc/Free on device:                    | 0.00105571      |  0.6        | 23              | 
+GPU: 67% Copy H<->D: 1.7% Alloc/free: 0.63% Comm: 0% CPU & I/O: 31%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.793499       
+
+Total time for the whole computation                                       27.2054        
+
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.eureka_cc89 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.eureka_cc89
new file mode 100644
index 0000000000..d1437d5c09
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.eureka_cc89
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VEF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     11-05-2026 -- 08:39:12
+OS:       eureka__Linux__x86_64__6.17.0-20-generic__#20~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Mar 19 01:28:37 UTC 2
+CPU model : INTEL(R) XEON(R) PLATINUM 8580
+Total number of threads:240
+GPU model: NVIDIA RTX 6000 Ada Generation
+CUDA runtime version: 12.90
+CUDA drivers version: 13.10
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1638400
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                36.5273        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.08866        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               6.4139         
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.712655       
+Standard deviation between time steps:                                     0.291946       
+Time elapsed in the skipped time steps:                                    1.19671        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0288126       |  4.0        | 3              
+Convection operator                      | 0.04361792      |  6.1        | 6              
+Diffusion operator                       | 0.02553571      |  3.6        | 24             
+Gradient operator                        | 0.01168674      |  1.6        | 6              
+Divergence operator                      | 0.003601375     |  0.5        | 4              
+Source terms                             | 0.02653658      |  3.7        | 6              
+Update ::mettre_a_jour                   | 0.01541138      |  2.2        | 1              
+Solver for implicit diffusion            | 0.01887595      |  2.6        | 6              
+Computation of the time step dt          | 0.003067977     |  0.4        | 10             
+Post-treatment operations                | 0.5206003       | 73.1        | 1              
+Other operations                         | 0.01490858      |  2.1        | 
+
+Average number of iteration of the linear solver per call:                 8              
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.028531        |  4.0        | 3               | 
+Kernels:                                 | 0.620912        | 87.1        | 1745            | 
+Copy host to device:                     | 0.000704046     |  0.1        | 43              | 4.1 GB/s
+Copy device to host:                     | 0.00607506      |  0.9        | 102             | 10.2 GB/s
+Alloc/Free on device:                    | 0.000138622     |  0.0        | 4               | 
+GPU: 91% Copy H<->D: 0.95% Alloc/free: 0.019% Comm: 0% CPU & I/O: 7.9%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.3616         
+
+Total time for the whole computation                                       44.4995        
+
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.irene-amd-ccrt_cc70 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.irene-amd-ccrt_cc70
index 41f24b0a65..659cabc7f3 100644
--- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.irene-amd-ccrt_cc70
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.irene-amd-ccrt_cc70
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-03-2026 -- 15:22:09
-OS:       irene7067__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     23-04-2026 -- 15:17:50
+OS:       irene7051__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
 Total number of threads:80
 GPU model: Tesla V100-SXM2-16GB
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                50.0545        
+Total time of the start-up:                                                52.508         
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             2.13913        
+Average time of the resolution of the linear problem per call:             1.6874         
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               5.19813        
+Total time of the time loop:                                               4.33648        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.57757        
-Standard deviation between time steps:                                     0.16654        
-Time elapsed in the skipped time steps:                                    1.45793        
+Average time per time step:                                                0.481832       
+Standard deviation between time steps:                                     0.178332       
+Time elapsed in the skipped time steps:                                    1.23974        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0491332       |  8.5        | 3              
-Convection operator                      | 0.1156491       | 20.0        | 6              
-Diffusion operator                       | 0.06374568      | 11.0        | 24             
-Gradient operator                        | 0.03774611      |  6.5        | 6              
-Divergence operator                      | 0.02187735      |  3.8        | 4              
-Source terms                             | 0.05641414      |  9.8        | 6              
-Update ::mettre_a_jour                   | 0.03031997      |  5.2        | 1              
-Solver for implicit diffusion            | 0.02578301      |  4.5        | 6              
-Computation of the time step dt          | 0.003343664     |  0.6        | 10             
-Post-treatment operations                | 0.1561775       | 27.0        | 1              
-Other operations                         | 0.01738065      |  3.0        | 
+Linear solver resolutions Ax=B           | 0.0441547       |  9.2        | 3              
+Convection operator                      | 0.08919729      | 18.5        | 6              
+Diffusion operator                       | 0.03954753      |  8.2        | 24             
+Gradient operator                        | 0.01919536      |  4.0        | 6              
+Divergence operator                      | 0.0121496       |  2.5        | 4              
+Source terms                             | 0.04608156      |  9.6        | 6              
+Update ::mettre_a_jour                   | 0.02597848      |  5.4        | 1              
+Solver for implicit diffusion            | 0.02597088      |  5.4        | 6              
+Computation of the time step dt          | 0.003337014     |  0.7        | 10             
+Post-treatment operations                | 0.1580431       | 32.8        | 1              
+Other operations                         | 0.01817609      |  3.8        | 
 
 Average number of iteration of the linear solver per call:                 8              
 
@@ -62,17 +62,17 @@ Average number of iteration of the linear solver per call:                 8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0484481       |  8.4        | 3               | 
-Kernels:                                 | 0.457903        | 79.3        | 1807            | 
-Copy host to device:                     | 0.0013404       |  0.2        | 43              | 2.2 GB/s
-Copy device to host:                     | 0.0153024       |  2.6        | 98              | 4.0 GB/s
-Alloc/Free on device:                    | 0.000149768     |  0.0        | 4               | 
-GPU: 88% Copy H<->D: 2.9% Alloc/free: 0.026% Comm: 0% CPU & I/O: 9.4%
+Libraries:                               | 0.0434439       |  9.0        | 3               | 
+Kernels:                                 | 0.363077        | 75.4        | 1804            | 
+Copy host to device:                     | 0.00140484      |  0.3        | 43              | 2.1 GB/s
+Copy device to host:                     | 0.0171422       |  3.6        | 98              | 3.6 GB/s
+Alloc/Free on device:                    | 0.000169156     |  0.0        | 4               | 
+GPU: 84% Copy H<->D: 3.8% Alloc/free: 0.035% Comm: 0% CPU & I/O: 12%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               1.0736         
+Time of the post-resolution:                                               0.982788       
 
-Total time for the whole computation                                       57.7842        
+Total time for the whole computation                                       59.0671        
 
-[Slurm] Power consumption (82 s):  0.178 kW  0.004 kWh  0.000 € (0.10€/kWh)
+[Slurm] Power consumption (73 s):  0.229 kW  0.005 kWh  0.000 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is157091_cc86 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is157091_cc86
index fee1e8c0b2..e656412694 100644
--- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is157091_cc86
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is157091_cc86
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 18:01:42
+Date:     07-05-2026 -- 14:32:18
 OS:       is157091__Linux__x86_64__5.11.12-300.fc34.x86_64__#1 SMP Wed Apr 7 16:31:13 UTC 2021
 CPU model : AMD Ryzen Threadripper PRO 5975WX 32-Cores
 Total number of threads:64
 GPU model: NVIDIA RTX A6000
 CUDA runtime version: 12.90
-CUDA drivers version: 12.70
+CUDA drivers version: 13.20
 Nb procs used for the computation: 1
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 1638400
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                29.542         
+Total time of the start-up:                                                29.1674        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             0.859958       
+Average time of the resolution of the linear problem per call:             0.982441       
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               4.91682        
+Total time of the time loop:                                               4.07933        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.546314       
-Standard deviation between time steps:                                     0.113084       
-Time elapsed in the skipped time steps:                                    0.906668       
+Average time per time step:                                                0.453259       
+Standard deviation between time steps:                                     0.108612       
+Time elapsed in the skipped time steps:                                    0.790546       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0500233       |  9.2        | 3              
-Convection operator                      | 0.1068827       | 19.6        | 6              
-Diffusion operator                       | 0.05186786      |  9.5        | 24             
-Gradient operator                        | 0.03487695      |  6.4        | 6              
-Divergence operator                      | 0.01704209      |  3.1        | 4              
-Source terms                             | 0.05857455      | 10.7        | 6              
-Update ::mettre_a_jour                   | 0.0282653       |  5.2        | 1              
-Solver for implicit diffusion            | 0.03199751      |  5.9        | 6              
-Computation of the time step dt          | 0.004401211     |  0.8        | 10             
-Post-treatment operations                | 0.141302        | 25.9        | 1              
-Other operations                         | 0.02108005      |  3.9        | 
+Linear solver resolutions Ax=B           | 0.0483038       | 10.7        | 3              
+Convection operator                      | 0.08223569      | 18.1        | 6              
+Diffusion operator                       | 0.04077174      |  9.0        | 24             
+Gradient operator                        | 0.0149534       |  3.3        | 6              
+Divergence operator                      | 0.01122735      |  2.5        | 4              
+Source terms                             | 0.05184123      | 11.4        | 6              
+Update ::mettre_a_jour                   | 0.02289992      |  5.1        | 1              
+Solver for implicit diffusion            | 0.03037772      |  6.7        | 6              
+Computation of the time step dt          | 0.004375326     |  1.0        | 10             
+Post-treatment operations                | 0.126597        | 27.9        | 1              
+Other operations                         | 0.01967598      |  4.3        | 
 
 Average number of iteration of the linear solver per call:                 8              
 
@@ -62,16 +62,16 @@ Average number of iteration of the linear solver per call:                 8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0493887       |  9.0        | 3               | 
-Kernels:                                 | 0.455456        | 83.4        | 1807            | 
-Copy host to device:                     | 0.000654241     |  0.1        | 43              | 4.5 GB/s
-Copy device to host:                     | 0.0061757       |  1.1        | 98              | 10.0 GB/s
-Alloc/Free on device:                    | 9.24147e-05     |  0.0        | 4               | 
-GPU: 92% Copy H<->D: 1.3% Alloc/free: 0.017% Comm: 0% CPU & I/O: 6.3%
+Libraries:                               | 0.0476381       | 10.5        | 3               | 
+Kernels:                                 | 0.366554        | 80.9        | 1733            | 
+Copy host to device:                     | 0.000658838     |  0.1        | 43              | 4.4 GB/s
+Copy device to host:                     | 0.00639428      |  1.4        | 98              | 9.7 GB/s
+Alloc/Free on device:                    | 0.000101789     |  0.0        | 4               | 
+GPU: 91% Copy H<->D: 1.6% Alloc/free: 0.022% Comm: 0% CPU & I/O: 7%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               0.432622       
+Time of the post-resolution:                                               0.427963       
 
-Total time for the whole computation                                       35.7981        
+Total time for the whole computation                                       34.4652        
 
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is159479_cc120 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is159479_cc120
new file mode 100644
index 0000000000..018703a961
--- /dev/null
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is159479_cc120
@@ -0,0 +1,77 @@
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VEF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     24-04-2026 -- 14:39:17
+OS:       is159479.intra.cea.fr__Linux__x86_64__6.12.0-160000.26-default__#1 SMP PREEMPT_DYNAMIC Thu Feb  5 00:00:11 UTC 2026 (f
+CPU model : AMD Ryzen Threadripper PRO 9965WX 24-Cores
+Total number of threads:48
+GPU model: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
+CUDA runtime version: 13.0
+CUDA drivers version: 13.0
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1638400
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                23.2994        
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             0.611811       
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               1.61824        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.179805       
+Standard deviation between time steps:                                     0.0577892      
+Time elapsed in the skipped time steps:                                    0.513971       
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0210945       | 11.7        | 3              
+Convection operator                      | 0.03025967      | 16.8        | 6              
+Diffusion operator                       | 0.01563452      |  8.7        | 24             
+Gradient operator                        | 0.006605338     |  3.7        | 6              
+Divergence operator                      | 0.001920241     |  1.1        | 4              
+Source terms                             | 0.02015723      | 11.2        | 6              
+Update ::mettre_a_jour                   | 0.008509181     |  4.7        | 1              
+Solver for implicit diffusion            | 0.009471892     |  5.3        | 6              
+Computation of the time step dt          | 0.00156277      |  0.9        | 10             
+Post-treatment operations                | 0.05698094      | 31.7        | 1              
+Other operations                         | 0.007608653     |  4.2        | 
+
+Average number of iteration of the linear solver per call:                 8              
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.020927        | 11.6        | 3               | 
+Kernels:                                 | 0.132036        | 73.4        | 1804            | 
+Copy host to device:                     | 0.00046937      |  0.3        | 43              | 6.2 GB/s
+Copy device to host:                     | 0.00734308      |  4.1        | 98              | 8.4 GB/s
+Alloc/Free on device:                    | 6.39967e-05     |  0.0        | 4               | 
+GPU: 85% Copy H<->D: 4.3% Alloc/free: 0.036% Comm: 0% CPU & I/O: 11%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.372868       
+
+Total time for the whole computation                                       25.8045        
+
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is247793_gfx1100 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is247793_gfx1100
index d91331f1d9..db1ff81fe9 100644
--- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is247793_gfx1100
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.is247793_gfx1100
@@ -1,52 +1,77 @@
-Statistiques d'initialisation du calcul
-
-Temps total                       54.7659
-
-Statistiques de resolution du probleme
-
-Temps total                       21.8014
-
-
-Timesteps                         10
-Secondes / pas de temps           2.18013
-Dont solveurs Ax=B                0.047244  2% (3 appels/pas de temps)
-Dont solveur diffusion_implicite  0.044039  2% (6 appels/pas de temps)
-Dont mettre_a_jour                0.156435  7% (1 appel/pas de temps)
-Dont operateurs convection        0.169475  7% (6 appels/pas de temps)
-Dont operateurs diffusion         0.109363  5% (24 appels/pas de temps)
-Dont operateurs gradient          0.041460  1% (6 appels/pas de temps)
-Dont operateurs divergence        0.011335  0% (4 appels/pas de temps)
-Dont operateurs source            0.278923 12% (6 appels/pas de temps)
-Dont operations postraitement     1.232232 56% (1 appel/pas de temps)
-Dont calcul dt                    0.005455  0% (10 appels/pas de temps)
-Dont calcul divers                0.084165  3% (0 appels/pas de temps)
-Nb solveur / pas de temps         3
-Secondes / solveur                0.0157481
-Iterations / solveur              5.13333
-GPU statistics per time step (experimental):
-Libraries : 0.046677 s  2.1%  3.0 calls
-Kernels   : 0.585664 s 26.9% 8259112.2 calls
-Copy H2D  : 0.059068 s  2.7% 123.4 calls 16.4 GB/s
-Copy D2H  : 0.120714 s  5.5% 174.3 calls 24.5 GB/s
-Alloc/Free: 0.005446 s  0.2% 42.4 calls
-GPU: 29% Copy H<->D: 8.2% Alloc/Free: 0.2% Comm: 0% CPU & Others: 62.4%
-I/O:
-
-Timesteps = number of time steps
-Nb solveur = number of linear system resolutions
-Nb assemblage implicite = number of matrix assemblies for the implicit scheme
-Iterations = average number of iterations of the solver
-Communications = fraction of the time spent
-                 in communications between processors (excluding io files)
-Network latency = time of one mpsum measured by an internal bench over 0.1s
-Network bandwidth = maximum on all processors
-                    of the average bandwidth of send_recv operations
-Waiting time = estimation of the waiting time of the different processors
-
-Max_waiting_time big    => probably due to a bad partitioning
-Communications > 30%    => too many processors or network too slow
-
-Statistiques de post resolution
-
-Temps total                       1.97435
+                                             # Global performance file #
+
+This is the global file for tracking performance in TRUST. It stores aggregated quantities.
+More detailed statistics can be found in the thermohydraulique_VEF_DNS_BENCH_csv.TU file
+For time loop, only standard counters of level 1 are printed alongside your custom counters
+Time is given in seconds
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Context of the computation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Date:     22-05-2026 -- 17:43:39
+OS:       is247793__Linux__x86_64__6.8.0-107-generic__#107~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Mar 18 23:40:43 UTC 
+CPU model : AMD Ryzen Threadripper PRO 7965WX 24-Cores
+Total number of threads:48
+GPU model: AMD Radeon PRO W7900
+HIP runtime version: 7.53
+HIP drivers version: 7.53
+Nb procs used for the computation: 1
+TRUST version: 1.9.8_beta
+Total number of elements used for the calculation: 1638400
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                           Computation start-up statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Total time of the start-up:                                                26.899         
+
+Number of calls to the linear solver per time step:                        2              
+Average time of the resolution of the linear problem per call:             1.26318        
+Average number of iteration of the linear solver per call:                 0              
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                                 Time loop statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The first time step is not accounted for the computation of the time loop statistics
+Total time of the time loop:                                               5.13364        
+Number of time steps:                                                      9              
+Skipped time steps:                                                        1              
+Average time per time step:                                                0.570404       
+Standard deviation between time steps:                                     0.176546       
+Time elapsed in the skipped time steps:                                    1.05177        
+
+
+Standard counter description             | Time/step       | % loop time | Call(s)/step   
+------------------------------------------------------------------------------------------
+Linear solver resolutions Ax=B           | 0.0608076       | 10.7        | 3              
+Convection operator                      | 0.09816364      | 17.2        | 6              
+Diffusion operator                       | 0.05411814      |  9.5        | 24             
+Gradient operator                        | 0.02642929      |  4.6        | 6              
+Divergence operator                      | 0.007925149     |  1.4        | 4              
+Source terms                             | 0.08792285      | 15.4        | 6              
+Update ::mettre_a_jour                   | 0.02459289      |  4.3        | 1              
+Solver for implicit diffusion            | 0.03142004      |  5.5        | 6              
+Computation of the time step dt          | 0.004884953     |  0.9        | 10             
+Post-treatment operations                | 0.149868        | 26.3        | 1              
+Other operations                         | 0.02427166      |  4.3        | 
+
+Average number of iteration of the linear solver per call:                 7              
+
+
+-----------------------------------------------------------------------------------------------------------
+                                                    GPU statistics
+-----------------------------------------------------------------------------------------------------------
+Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
+-----------------------------------------------------------------------------------------------------------
+Libraries:                               | 0.0588719       | 10.3        | 3               | 
+Kernels:                                 | 0.470853        | 82.5        | 1632            | 
+Copy host to device:                     | 0.00133171      |  0.2        | 51              | 2.2 GB/s
+Copy device to host:                     | 0.00484713      |  0.8        | 40              | 12.8 GB/s
+Alloc/Free on device:                    | 0.000724319     |  0.1        | 23              | 
+GPU: 93% Copy H<->D: 1.1% Alloc/free: 0.13% Comm: 0% CPU & I/O: 5.9%
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                              Post-resolution statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Time of the post-resolution:                                               0.663809       
+
+Total time for the whole computation                                       33.7482        
 
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.lumi_gfx90a b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.lumi_gfx90a
index 40ba048fd3..ce8211b76c 100644
--- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.lumi_gfx90a
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.lumi_gfx90a
@@ -8,13 +8,13 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 13:11:49
-OS:       nid005020__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
+Date:     15-05-2026 -- 21:17:21
+OS:       nid007956__Linux__x86_64__6.4.0-150600.23.73_15.0.14-cray_shasta_c__#1 SMP Tue Oct 21 20:32:25 UTC 2025 (89d3c98)
 CPU model : AMD EPYC 7A53 64-Core Processor
 Total number of threads:128
 GPU model: AMD Instinct MI250X
-HIP runtime version: 6.42
-HIP drivers version: 6.42
+HIP runtime version: 6.43
+HIP drivers version: 6.43
 Nb procs used for the computation: 1
 TRUST version: 1.9.8_beta
 Total number of elements used for the calculation: 1638400
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                41.2855        
+Total time of the start-up:                                                76.7767        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.36088        
+Average time of the resolution of the linear problem per call:             2.71566        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               8.43206        
+Total time of the time loop:                                               6.97185        
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.936896       
-Standard deviation between time steps:                                     0.1774         
-Time elapsed in the skipped time steps:                                    1.36652        
+Average time per time step:                                                0.77465        
+Standard deviation between time steps:                                     0.278049       
+Time elapsed in the skipped time steps:                                    1.17363        
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0714963       |  6.6        | 3              
-Convection operator                      | 0.2741781       | 25.2        | 6              
-Diffusion operator                       | 0.09669706      |  8.9        | 24             
-Gradient operator                        | 0.08605537      |  7.9        | 6              
-Divergence operator                      | 0.0199693       |  1.8        | 4              
-Source terms                             | 0.1487263       | 13.7        | 6              
-Update ::mettre_a_jour                   | 0.02601599      |  2.4        | 1              
-Solver for implicit diffusion            | 0.02026094      |  1.9        | 6              
-Computation of the time step dt          | 0.003127665     |  0.3        | 10             
-Post-treatment operations                | 0.1756955       | 16.1        | 1              
-Other operations                         | 0.01467348      |  1.3        | 
+Linear solver resolutions Ax=B           | 0.028532        |  3.7        | 3              
+Convection operator                      | 0.2375367       | 30.7        | 6              
+Diffusion operator                       | 0.08060514      | 10.4        | 24             
+Gradient operator                        | 0.04153123      |  5.4        | 6              
+Divergence operator                      | 0.01144389      |  1.5        | 4              
+Source terms                             | 0.1300165       | 16.8        | 6              
+Update ::mettre_a_jour                   | 0.01896836      |  2.4        | 1              
+Solver for implicit diffusion            | 0.01883069      |  2.4        | 6              
+Computation of the time step dt          | 0.00290556      |  0.4        | 10             
+Post-treatment operations                | 0.1897926       | 24.5        | 1              
+Other operations                         | 0.01448759      |  1.9        | 
 
 Average number of iteration of the linear solver per call:                 8              
 
@@ -62,17 +62,17 @@ Average number of iteration of the linear solver per call:                 8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0687561       |  7.3        | 3               | 
-Kernels:                                 | 0.805209        | 85.9        | 1807            | 
-Copy host to device:                     | 0.00104428      |  0.1        | 43              | 2.8 GB/s
-Copy device to host:                     | 0.00508409      |  0.5        | 98              | 12.2 GB/s
-Alloc/Free on device:                    | 0.000108709     |  0.0        | 4               | 
-GPU: 93% Copy H<->D: 0.65% Alloc/free: 0.012% Comm: 0% CPU & I/O: 6.1%
+Libraries:                               | 0.0279793       |  3.6        | 3               | 
+Kernels:                                 | 0.680184        | 87.8        | 1632            | 
+Copy host to device:                     | 0.0011041       |  0.1        | 51              | 2.6 GB/s
+Copy device to host:                     | 0.00390742      |  0.5        | 40              | 15.9 GB/s
+Alloc/Free on device:                    | 0.000569724     |  0.1        | 23              | 
+GPU: 91% Copy H<->D: 0.65% Alloc/free: 0.074% Comm: 0% CPU & I/O: 7.9%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               1.51324        
+Time of the post-resolution:                                               1.70629        
 
-Total time for the whole computation                                       52.5973        
+Total time for the whole computation                                       86.6285        
 
-Power consumption: 0 kW 0 kWh 0 € (0.12€/kWh)
+[Slurm] Power consumption (107 s):  0.497 kW  0.015 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.topaze_cc80 b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.topaze_cc80
index c45966936b..1c6a1ecb9a 100644
--- a/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.topaze_cc80
+++ b/tests/GPU/thermohydraulique_VEF_DNS/thermohydraulique_VEF_DNS_BENCH.TU.topaze_cc80
@@ -8,8 +8,8 @@ Time is given in seconds
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Context of the computation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Date:     20-02-2026 -- 18:14:24
-OS:       topaze7059__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
+Date:     15-05-2026 -- 14:02:44
+OS:       topaze7040__Linux__x86_64__4.18.0-553.89.1.el8_10.x86_64__#1 SMP Sat Nov 29 00:49:18 EST 2025
 CPU model : AMD EPYC 7763 64-Core Processor
 Total number of threads:256
 GPU model: NVIDIA A100-SXM4-80GB
@@ -22,37 +22,37 @@ Total number of elements used for the calculation: 1638400
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                            Computation start-up statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Total time of the start-up:                                                39.1244        
+Total time of the start-up:                                                39.6475        
 
 Number of calls to the linear solver per time step:                        2              
-Average time of the resolution of the linear problem per call:             1.31573        
+Average time of the resolution of the linear problem per call:             1.26543        
 Average number of iteration of the linear solver per call:                 0              
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                                  Time loop statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The first time step is not accounted for the computation of the time loop statistics
-Total time of the time loop:                                               3.01555        
+Total time of the time loop:                                               2.7949         
 Number of time steps:                                                      9              
 Skipped time steps:                                                        1              
-Average time per time step:                                                0.335062       
-Standard deviation between time steps:                                     0.151722       
-Time elapsed in the skipped time steps:                                    0.911653       
+Average time per time step:                                                0.310545       
+Standard deviation between time steps:                                     0.237972       
+Time elapsed in the skipped time steps:                                    0.784626       
 
 
 Standard counter description             | Time/step       | % loop time | Call(s)/step   
 ------------------------------------------------------------------------------------------
-Linear solver resolutions Ax=B           | 0.0285602       |  6.5        | 3              
-Convection operator                      | 0.0496885       | 11.4        | 6              
-Diffusion operator                       | 0.03493974      |  8.0        | 24             
-Gradient operator                        | 0.01687099      |  3.9        | 6              
-Divergence operator                      | 0.005575172     |  1.3        | 4              
-Source terms                             | 0.03593161      |  8.2        | 6              
-Update ::mettre_a_jour                   | 0.01548032      |  3.5        | 1              
-Solver for implicit diffusion            | 0.01588839      |  3.6        | 6              
-Computation of the time step dt          | 0.002096975     |  0.5        | 10             
-Post-treatment operations                | 0.1171378       | 26.8        | 1              
-Other operations                         | 0.01289186      |  3.0        | 
+Linear solver resolutions Ax=B           | 0.0277935       |  8.9        | 3              
+Convection operator                      | 0.0337385       | 10.9        | 6              
+Diffusion operator                       | 0.02719853      |  8.8        | 24             
+Gradient operator                        | 0.01307838      |  4.2        | 6              
+Divergence operator                      | 0.002931471     |  0.9        | 4              
+Source terms                             | 0.03489879      | 11.2        | 6              
+Update ::mettre_a_jour                   | 0.01111204      |  3.6        | 1              
+Solver for implicit diffusion            | 0.01428297      |  4.6        | 6              
+Computation of the time step dt          | 0.001949491     |  0.6        | 10             
+Post-treatment operations                | 0.1323786       | 42.6        | 1              
+Other operations                         | 0.01118269      |  3.6        | 
 
 Average number of iteration of the linear solver per call:                 8              
 
@@ -62,16 +62,17 @@ Average number of iteration of the linear solver per call:                 8
 -----------------------------------------------------------------------------------------------------------
 Counter description                      | Time per step   | % loop time | Call(s)/step    | Bandwidth 
 -----------------------------------------------------------------------------------------------------------
-Libraries:                               | 0.0281164       |  8.4        | 3               | 
-Kernels:                                 | 0.237447        | 70.9        | 1807            | 
-Copy host to device:                     | 0.000842697     |  0.3        | 43              | 3.5 GB/s
-Copy device to host:                     | 0.00602455      |  1.8        | 98              | 10.3 GB/s
-Alloc/Free on device:                    | 0.000168023     |  0.1        | 4               | 
-GPU: 79% Copy H<->D: 2% Alloc/free: 0.05% Comm: 0% CPU & I/O: 19%
+Libraries:                               | 0.027341        |  8.8        | 3               | 
+Kernels:                                 | 0.212071        | 68.3        | 1632            | 
+Copy host to device:                     | 0.00092218      |  0.3        | 51              | 3.2 GB/s
+Copy device to host:                     | 0.00497458      |  1.6        | 40              | 12.5 GB/s
+Alloc/Free on device:                    | 0.000821941     |  0.3        | 23              | 
+GPU: 77% Copy H<->D: 1.9% Alloc/free: 0.26% Comm: 0% CPU & I/O: 21%
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                               Post-resolution statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Time of the post-resolution:                                               1.31988        
+Time of the post-resolution:                                               1.08213        
 
-Total time for the whole computation                                       44.3715        
+Total time for the whole computation                                       44.3091        
 
+[Slurm] Power consumption (79 s):  0.511 kW  0.011 kWh  0.001 € (0.10€/kWh)
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.data b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.data
new file mode 100755
index 0000000000..02951dea58
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.data
@@ -0,0 +1,126 @@
+Dimension 3
+
+Pb_hydraulique pb
+
+Domaine dom
+Mailler dom
+{
+	pave bloc
+	{
+		origine 0 0 0
+		longueurs 1 1 1
+		nombre_de_noeuds 6 6 6
+		}
+		{
+		bord frontiere	X = 0	0 <= Y <= 1	0 <= Z <= 1
+		bord frontiere	X = 1	0 <= Y <= 1	0 <= Z <= 1
+		bord frontiere	Y = 0	0 <= X <= 1	0 <= Z <= 1
+		bord frontiere	Y = 1	0 <= X <= 1	0 <= Z <= 1
+		bord frontiere	Z = 0	0 <= X <= 1	0 <= Y <= 1
+		bord frontiere	Z = 1	0 <= X <= 1	0 <= Y <= 1
+		}
+}
+
+VDF dis
+
+Schema_Euler_explicite sch
+Lire sch
+{
+	nb_pas_dt_max 10
+	tinit 0
+	tmax 1
+	dt_sauv -1
+}
+
+Associer pb dom
+Associer pb sch
+
+Discretiser pb dis
+
+Lire pb
+{
+	Fluide_incompressible
+	{
+		mu	champ_uniforme 1 1
+		rho	champ_uniforme 1 1
+	}
+	Navier_Stokes_standard
+	{
+		solveur_pression	petsc Cholesky { }
+		conditions_initiales	{ vitesse champ_uniforme 3 1 0 0 }
+		conditions_limites	{
+					frontiere paroi_fixe
+					}
+		convection		{ centre }
+		diffusion		{ }
+		sources			{ source_qdm champ_fonc_xyz dom 3 cos(2*pi*x)*cos(2*pi*y)*cos(2*pi*z) cos(2*pi*x)*cos(2*pi*y)*sin(2*pi*z) cos(2*pi*x)*sin(2*pi*y)*sin(2*pi*z) }
+	}
+	Postraitement
+	{
+		definition_champs	{
+					ui		refChamp		{ pb_champ pb vitesse }
+					u1		transformation		{ methode composante numero 0 localisation elem sources_reference { ui } }
+					u2		transformation		{ methode composante numero 1 localisation elem sources_reference { ui } }
+					u3		transformation		{ methode composante numero 2 localisation elem sources_reference { ui } }
+					moy_u1		moyenne			{ t_deb 0 t_fin 1e+6 sources_reference { u1 } }
+					moy_u2		moyenne			{ t_deb 0 t_fin 1e+6 sources_reference { u2 } }
+					moy_u3		moyenne			{ t_deb 0 t_fin 1e+6 sources_reference { u3 } }
+					u1prime		transformation		{ methode formule expression 1 u1-moy_u1 localisation elem sources_reference { u1 , moy_u1 } }
+					u2prime		transformation		{ methode formule expression 1 u2-moy_u2 localisation elem sources_reference { u2 , moy_u2 } }
+					u3prime		transformation		{ methode formule expression 1 u3-moy_u3 localisation elem sources_reference { u3 , moy_u3 } }
+
+					u1u1u1_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u1prime localisation elem sources_reference { u1prime } } } }
+					u1u1u2_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u2prime localisation elem sources_reference { u1prime , u2prime } } } }
+					u1u1u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u3prime localisation elem sources_reference { u1prime , u3prime } } } }
+					u1u2u2_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u2prime*u2prime localisation elem sources_reference { u1prime , u2prime } } } }
+					u1u2u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u2prime*u3prime localisation elem sources_reference { u1prime , u2prime , u3prime } } } }
+					u1u3u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u3prime*u3prime localisation elem sources_reference { u1prime , u3prime } } } }
+					u2u2u2_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u2prime*u2prime localisation elem sources_reference { u2prime } } } }
+					u2u2u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u2prime*u3prime localisation elem sources_reference { u2prime , u3prime } } } }
+					u2u3u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u3prime*u3prime localisation elem sources_reference { u2prime , u3prime } } } }
+					u3u3u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u3prime*u3prime*u3prime localisation elem sources_reference { u3prime } } } }
+
+					# uiujuk = 27-component vector : component (i,j,k) with i,j,k in 1..3 -> 0-based column 9*i+3*j+k-13 #
+					uiujuk			correlation_triple	{ t_deb 0 t_fin 1e+6 sources_reference { ui , ui , ui } }
+					u1u1u1_methode2		transformation		{ methode composante numero 0 localisation elem sources_reference { uiujuk } }
+					u1u1u2_methode2		transformation		{ methode composante numero 1 localisation elem sources_reference { uiujuk } }
+					u1u1u3_methode2		transformation		{ methode composante numero 2 localisation elem sources_reference { uiujuk } }
+					u1u2u2_methode2		transformation		{ methode composante numero 4 localisation elem sources_reference { uiujuk } }
+					u1u2u3_methode2		transformation		{ methode composante numero 5 localisation elem sources_reference { uiujuk } }
+					u1u3u3_methode2		transformation		{ methode composante numero 8 localisation elem sources_reference { uiujuk } }
+					u2u2u2_methode2		transformation		{ methode composante numero 13 localisation elem sources_reference { uiujuk } }
+					u2u2u3_methode2		transformation		{ methode composante numero 14 localisation elem sources_reference { uiujuk } }
+					u2u3u3_methode2		transformation		{ methode composante numero 17 localisation elem sources_reference { uiujuk } }
+					u3u3u3_methode2		transformation		{ methode composante numero 26 localisation elem sources_reference { uiujuk } }
+					}
+		sondes			{
+					u1u1u1_methode1	u1u1u1_methode1 periode 1e-6 point 1 0.5 0.5 0.5
+					u1u1u2_methode1 u1u1u2_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u1u1u3_methode1 u1u1u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u1u2u2_methode1 u1u2u2_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u1u2u3_methode1 u1u2u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u1u3u3_methode1 u1u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u2u2u2_methode1 u2u2u2_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u2u2u3_methode1 u2u2u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u2u3u3_methode1 u2u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u3u3u3_methode1 u3u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+
+					u1u1u1_methode2 u1u1u1_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u1u2_methode2 u1u1u2_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u1u3_methode2 u1u1u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u2u2_methode2 u1u2u2_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u2u3_methode2 u1u2u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u3u3_methode2 u1u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u2u2u2_methode2 u2u2u2_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u2u2u3_methode2 u2u2u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u2u3u3_methode2 u2u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u3u3u3_methode2 u3u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					}
+	}
+}
+
+EcritureLectureSpecial 0
+
+Resoudre pb
+ 
+Fin
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.lml.gz b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.lml.gz
new file mode 100644
index 0000000000..b73bcf440a
Binary files /dev/null and b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF.lml.gz differ
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE1.son.ref
new file mode 100644
index 0000000000..5b9124bf11
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U1U1_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U1_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -8.18407828e-11
+1.99971806e-02 -1.46044746e-10
+2.66621435e-02 -1.82050285e-10
+3.33270417e-02 -1.92896806e-10
+3.99919067e-02 -1.90502981e-10
+4.66567528e-02 -1.82207513e-10
+5.33215866e-02 -1.71361748e-10
+5.99864138e-02 -1.60277265e-10
+6.66512360e-02 -1.49535492e-10
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE2.son.ref
new file mode 100644
index 0000000000..06ba690bb8
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U1_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U1U1_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U1_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -8.18407828e-11
+1.99971806e-02 -1.46044746e-10
+2.66621435e-02 -1.82050285e-10
+3.33270417e-02 -1.92896806e-10
+3.99919067e-02 -1.90502981e-10
+4.66567528e-02 -1.82207513e-10
+5.33215866e-02 -1.71361748e-10
+5.99864138e-02 -1.60277265e-10
+6.66512360e-02 -1.49535492e-10
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE1.son.ref
new file mode 100644
index 0000000000..a1d24db973
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U1U2_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U2_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -4.09203914e-11
+1.99971806e-02 -7.30223733e-11
+2.66621435e-02 -9.10251430e-11
+3.33270417e-02 -9.64484038e-11
+3.99919067e-02 -9.52514912e-11
+4.66567528e-02 -9.11037572e-11
+5.33215866e-02 -8.56808748e-11
+5.99864138e-02 -8.01386335e-11
+6.66512360e-02 -7.47677468e-11
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE2.son.ref
new file mode 100644
index 0000000000..622fe5415c
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U2_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U1U2_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U2_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -4.09203914e-11
+1.99971806e-02 -7.30223733e-11
+2.66621435e-02 -9.10251430e-11
+3.33270417e-02 -9.64484038e-11
+3.99919067e-02 -9.52514912e-11
+4.66567528e-02 -9.11037572e-11
+5.33215866e-02 -8.56808748e-11
+5.99864138e-02 -8.01386335e-11
+6.66512360e-02 -7.47677468e-11
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE1.son.ref
new file mode 100644
index 0000000000..f36fa51c38
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U1U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -6.51020414e-15
+1.99971806e-02 -1.79543497e-14
+2.66621435e-02 -2.58186455e-14
+3.33270417e-02 -2.93005285e-14
+3.99919067e-02 -3.01466580e-14
+4.66567528e-02 -2.95220709e-14
+5.33215866e-02 -2.82337440e-14
+5.99864138e-02 -2.66926033e-14
+6.66512360e-02 -2.51088448e-14
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE2.son.ref
new file mode 100644
index 0000000000..2f77bcd18e
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U1U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U1U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -6.51020414e-15
+1.99971806e-02 -1.79543497e-14
+2.66621435e-02 -2.58186455e-14
+3.33270417e-02 -2.93005285e-14
+3.99919067e-02 -3.01466580e-14
+4.66567528e-02 -2.95220709e-14
+5.33215866e-02 -2.82337440e-14
+5.99864138e-02 -2.66926033e-14
+6.66512360e-02 -2.51088448e-14
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE1.son.ref
new file mode 100644
index 0000000000..e42032e0fb
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U2U2_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U2U2_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -2.04601957e-11
+1.99971806e-02 -3.65111867e-11
+2.66621435e-02 -4.55125718e-11
+3.33270417e-02 -4.82242023e-11
+3.99919067e-02 -4.76257460e-11
+4.66567528e-02 -4.55518790e-11
+5.33215866e-02 -4.28404378e-11
+5.99864138e-02 -4.00693172e-11
+6.66512360e-02 -3.73838738e-11
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE2.son.ref
new file mode 100644
index 0000000000..672aee1070
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U2_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U2U2_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U2U2_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -2.04601957e-11
+1.99971806e-02 -3.65111867e-11
+2.66621435e-02 -4.55125718e-11
+3.33270417e-02 -4.82242023e-11
+3.99919067e-02 -4.76257460e-11
+4.66567528e-02 -4.55518790e-11
+5.33215866e-02 -4.28404378e-11
+5.99864138e-02 -4.00693172e-11
+6.66512360e-02 -3.73838738e-11
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE1.son.ref
new file mode 100644
index 0000000000..da3a8b7bb8
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U2U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U2U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -3.25510207e-15
+1.99971806e-02 -8.97717487e-15
+2.66621435e-02 -1.29093228e-14
+3.33270417e-02 -1.46502644e-14
+3.99919067e-02 -1.50733291e-14
+4.66567528e-02 -1.47610356e-14
+5.33215866e-02 -1.41168722e-14
+5.99864138e-02 -1.33463018e-14
+6.66512360e-02 -1.25544225e-14
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE2.son.ref
new file mode 100644
index 0000000000..0b120ccc4b
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U2U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U2U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U2U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -3.25510207e-15
+1.99971806e-02 -8.97717487e-15
+2.66621435e-02 -1.29093228e-14
+3.33270417e-02 -1.46502644e-14
+3.99919067e-02 -1.50733291e-14
+4.66567528e-02 -1.47610356e-14
+5.33215866e-02 -1.41168722e-14
+5.99864138e-02 -1.33463018e-14
+6.66512360e-02 -1.25544225e-14
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE1.son.ref
new file mode 100644
index 0000000000..9134f77668
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U3U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U3U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -5.17868432e-19
+1.99971806e-02 -2.37125564e-18
+2.66621435e-02 -3.88270790e-18
+3.33270417e-02 -4.68789302e-18
+3.99919067e-02 -5.00982847e-18
+4.66567528e-02 -5.01075198e-18
+5.33215866e-02 -4.86799847e-18
+5.99864138e-02 -4.64726336e-18
+6.66512360e-02 -4.40545326e-18
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE2.son.ref
new file mode 100644
index 0000000000..7582db2bbe
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U1U3U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U1U3U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U3U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -5.17868432e-19
+1.99971806e-02 -2.37125564e-18
+2.66621435e-02 -3.88270790e-18
+3.33270417e-02 -4.68789302e-18
+3.99919067e-02 -5.00982847e-18
+4.66567528e-02 -5.01075198e-18
+5.33215866e-02 -4.86799847e-18
+5.99864138e-02 -4.64726336e-18
+6.66512360e-02 -4.40545326e-18
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE1.son.ref
new file mode 100644
index 0000000000..f1f5819c22
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U2U2U2_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U2U2_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -1.02300979e-11
+1.99971806e-02 -1.82555934e-11
+2.66621435e-02 -2.27562860e-11
+3.33270417e-02 -2.41121013e-11
+3.99919067e-02 -2.38128732e-11
+4.66567528e-02 -2.27759397e-11
+5.33215866e-02 -2.14202191e-11
+5.99864138e-02 -2.00346588e-11
+6.66512360e-02 -1.86919371e-11
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE2.son.ref
new file mode 100644
index 0000000000..425c540bcc
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U2_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U2U2U2_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U2U2_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -1.02300979e-11
+1.99971806e-02 -1.82555934e-11
+2.66621435e-02 -2.27562860e-11
+3.33270417e-02 -2.41121013e-11
+3.99919067e-02 -2.38128732e-11
+4.66567528e-02 -2.27759397e-11
+5.33215866e-02 -2.14202191e-11
+5.99864138e-02 -2.00346588e-11
+6.66512360e-02 -1.86919371e-11
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE1.son.ref
new file mode 100644
index 0000000000..b726e58409
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U2U2U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U2U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -1.62755103e-15
+1.99971806e-02 -4.48858744e-15
+2.66621435e-02 -6.45466146e-15
+3.33270417e-02 -7.32513225e-15
+3.99919067e-02 -7.53666465e-15
+4.66567528e-02 -7.38051788e-15
+5.33215866e-02 -7.05843616e-15
+5.99864138e-02 -6.67315098e-15
+6.66512360e-02 -6.27721135e-15
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE2.son.ref
new file mode 100644
index 0000000000..475fca6de8
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U2U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U2U2U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U2U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -1.62755103e-15
+1.99971806e-02 -4.48858744e-15
+2.66621435e-02 -6.45466146e-15
+3.33270417e-02 -7.32513225e-15
+3.99919067e-02 -7.53666465e-15
+4.66567528e-02 -7.38051788e-15
+5.33215866e-02 -7.05843616e-15
+5.99864138e-02 -6.67315098e-15
+6.66512360e-02 -6.27721135e-15
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE1.son.ref
new file mode 100644
index 0000000000..bb0117ccfa
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U2U3U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U3U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -2.58934216e-19
+1.99971806e-02 -1.18562782e-18
+2.66621435e-02 -1.94135396e-18
+3.33270417e-02 -2.34394653e-18
+3.99919067e-02 -2.50491426e-18
+4.66567528e-02 -2.50537602e-18
+5.33215866e-02 -2.43399927e-18
+5.99864138e-02 -2.32363171e-18
+6.66512360e-02 -2.20272666e-18
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE2.son.ref
new file mode 100644
index 0000000000..b83645a934
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U2U3U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U2U3U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U3U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 0.00000000e+00
+1.33320901e-02 -2.58934216e-19
+1.99971806e-02 -1.18562782e-18
+2.66621435e-02 -1.94135396e-18
+3.33270417e-02 -2.34394653e-18
+3.99919067e-02 -2.50491426e-18
+4.66567528e-02 -2.50537602e-18
+5.33215866e-02 -2.43399927e-18
+5.99864138e-02 -2.32363171e-18
+6.66512360e-02 -2.20272666e-18
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE1.son.ref
new file mode 100644
index 0000000000..29887e50fa
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U3U3U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U3U3U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 1.49813643e-95
+1.33320901e-02 -4.11949775e-23
+1.99971806e-02 -3.28966603e-22
+2.66621435e-02 -6.05179509e-22
+3.33270417e-02 -7.73523441e-22
+3.99919067e-02 -8.57039449e-22
+4.66567528e-02 -8.74074347e-22
+5.33215866e-02 -8.62178607e-22
+5.99864138e-02 -8.30605582e-22
+6.66512360e-02 -7.93322822e-22
diff --git a/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE2.son.ref
new file mode 100644
index 0000000000..e9f9c3f4a9
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/Correlation_triple_VDF_U3U3U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VDF_U3U3U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U3U3U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+6.66666667e-03 1.49813643e-95
+1.33320901e-02 -4.11949775e-23
+1.99971806e-02 -3.28966603e-22
+2.66621435e-02 -6.05179509e-22
+3.33270417e-02 -7.73523441e-22
+3.99919067e-02 -8.57039449e-22
+4.66567528e-02 -8.74074347e-22
+5.33215866e-02 -8.62178607e-22
+5.99864138e-02 -8.30605582e-22
+6.66512360e-02 -7.93322822e-22
diff --git a/tests/Reference/Correlation_triple_VDF/verifie b/tests/Reference/Correlation_triple_VDF/verifie
new file mode 100755
index 0000000000..f989b60130
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VDF/verifie
@@ -0,0 +1,17 @@
+message()
+{
+   # Compare status $1 with the expected value $2; on mismatch report the command held in $msg and set err=1.
+   [ "$1" != "$2" ] && echo $ECHO_OPTS "Error ($1!=$2) when checking:\n $msg" && err=1   # $ECHO_OPTS left unquoted: presumably set by the harness and possibly empty
+}
+
+#####################################
+# Non-regression comparison of the .son probe files (reduction) against their .son.ref references
+#####################################
+err=0
+: >verifie.log   # start from an empty log; every comparison below appends to it so no diagnostic is overwritten
+for file in `ls *.son.ref 2>/dev/null`; do
+   msg="compare_sonde $file ${file%.ref}"
+   eval $msg 1>>verifie.log 2>&1
+   message $? 0
+done
+exit $err
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.data b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.data
new file mode 100755
index 0000000000..eeaf320247
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.data
@@ -0,0 +1,127 @@
+Dimension 3
+
+Pb_hydraulique pb
+
+Domaine dom
+Mailler dom
+{
+	pave bloc
+	{
+		origine 0 0 0
+		longueurs 1 1 1
+		nombre_de_noeuds 3 3 3
+		}
+		{
+		bord frontiere	X = 0	0 <= Y <= 1	0 <= Z <= 1
+		bord frontiere	X = 1	0 <= Y <= 1	0 <= Z <= 1
+		bord frontiere	Y = 0	0 <= X <= 1	0 <= Z <= 1
+		bord frontiere	Y = 1	0 <= X <= 1	0 <= Z <= 1
+		bord frontiere	Z = 0	0 <= X <= 1	0 <= Y <= 1
+		bord frontiere	Z = 1	0 <= X <= 1	0 <= Y <= 1
+		}
+}
+Tetraedriser_homogene_fin dom
+
+VEFPreP1b dis
+
+Schema_Euler_explicite sch
+Lire sch
+{
+	nb_pas_dt_max 10
+	tinit 0
+	tmax 1
+	dt_sauv -1
+}
+
+Associer pb dom
+Associer pb sch
+
+Discretiser pb dis
+
+Lire pb
+{
+	Fluide_incompressible
+	{
+		mu	champ_uniforme 1 1
+		rho	champ_uniforme 1 1
+	}
+	Navier_Stokes_standard
+	{
+		solveur_pression	petsc Cholesky { }
+		conditions_initiales	{ vitesse champ_uniforme 3 1 0 0 }
+		conditions_limites	{
+					frontiere paroi_fixe
+					}
+		convection		{ centre }
+		diffusion		{ }
+		sources			{ source_qdm champ_fonc_xyz dom 3 cos(2*pi*x)*cos(2*pi*y)*cos(2*pi*z) cos(2*pi*x)*cos(2*pi*y)*sin(2*pi*z) cos(2*pi*x)*sin(2*pi*y)*sin(2*pi*z) }
+	}
+	Postraitement
+	{
+		definition_champs	{
+					ui		refChamp		{ pb_champ pb vitesse }
+					u1		transformation		{ methode composante numero 0 localisation elem sources_reference { ui } }
+					u2		transformation		{ methode composante numero 1 localisation elem sources_reference { ui } }
+					u3		transformation		{ methode composante numero 2 localisation elem sources_reference { ui } }
+					moy_u1		moyenne			{ t_deb 0 t_fin 1e+6 sources_reference { u1 } }
+					moy_u2		moyenne			{ t_deb 0 t_fin 1e+6 sources_reference { u2 } }
+					moy_u3		moyenne			{ t_deb 0 t_fin 1e+6 sources_reference { u3 } }
+					u1prime		transformation		{ methode formule expression 1 u1-moy_u1 localisation elem sources_reference { u1 , moy_u1 } }
+					u2prime		transformation		{ methode formule expression 1 u2-moy_u2 localisation elem sources_reference { u2 , moy_u2 } }
+					u3prime		transformation		{ methode formule expression 1 u3-moy_u3 localisation elem sources_reference { u3 , moy_u3 } }
+
+					u1u1u1_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u1prime localisation elem sources_reference { u1prime } } } }
+					u1u1u2_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u2prime localisation elem sources_reference { u1prime , u2prime } } } }
+					u1u1u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u1prime*u3prime localisation elem sources_reference { u1prime , u3prime } } } }
+					u1u2u2_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u2prime*u2prime localisation elem sources_reference { u1prime , u2prime } } } }
+					u1u2u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u2prime*u3prime localisation elem sources_reference { u1prime , u2prime , u3prime } } } }
+					u1u3u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u1prime*u3prime*u3prime localisation elem sources_reference { u1prime , u3prime } } } }
+					u2u2u2_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u2prime*u2prime localisation elem sources_reference { u2prime } } } }
+					u2u2u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u2prime*u3prime localisation elem sources_reference { u2prime , u3prime } } } }
+					u2u3u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u2prime*u3prime*u3prime localisation elem sources_reference { u2prime , u3prime } } } }
+					u3u3u3_methode1		moyenne		{ t_deb 0 t_fin 1e+6 sources { transformation { methode formule expression 1 u3prime*u3prime*u3prime localisation elem sources_reference { u3prime } } } }
+
+					# uiujuk = 27-component vector : component (i,j,k) with i,j,k in 1..3 -> 0-based column 9*i+3*j+k-13 #
+					uiujuk			correlation_triple	{ t_deb 0 t_fin 1e+6 sources_reference { ui , ui , ui } }
+					u1u1u1_methode2		transformation		{ methode composante numero 0 localisation elem sources_reference { uiujuk } }
+					u1u1u2_methode2		transformation		{ methode composante numero 1 localisation elem sources_reference { uiujuk } }
+					u1u1u3_methode2		transformation		{ methode composante numero 2 localisation elem sources_reference { uiujuk } }
+					u1u2u2_methode2		transformation		{ methode composante numero 4 localisation elem sources_reference { uiujuk } }
+					u1u2u3_methode2		transformation		{ methode composante numero 5 localisation elem sources_reference { uiujuk } }
+					u1u3u3_methode2		transformation		{ methode composante numero 8 localisation elem sources_reference { uiujuk } }
+					u2u2u2_methode2		transformation		{ methode composante numero 13 localisation elem sources_reference { uiujuk } }
+					u2u2u3_methode2		transformation		{ methode composante numero 14 localisation elem sources_reference { uiujuk } }
+					u2u3u3_methode2		transformation		{ methode composante numero 17 localisation elem sources_reference { uiujuk } }
+					u3u3u3_methode2		transformation		{ methode composante numero 26 localisation elem sources_reference { uiujuk } }
+					}
+		sondes			{
+					u1u1u1_methode1	u1u1u1_methode1 periode 1e-6 point 1 0.5 0.5 0.5
+					u1u1u2_methode1 u1u1u2_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u1u1u3_methode1 u1u1u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u1u2u2_methode1 u1u2u2_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u1u2u3_methode1 u1u2u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u1u3u3_methode1 u1u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u2u2u2_methode1 u2u2u2_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u2u2u3_methode1 u2u2u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u2u3u3_methode1 u2u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+					u3u3u3_methode1 u3u3u3_methode1 periode 1e-6 position_like u1u1u1_methode1
+
+					u1u1u1_methode2 u1u1u1_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u1u2_methode2 u1u1u2_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u1u3_methode2 u1u1u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u2u2_methode2 u1u2u2_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u2u3_methode2 u1u2u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u1u3u3_methode2 u1u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u2u2u2_methode2 u2u2u2_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u2u2u3_methode2 u2u2u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u2u3u3_methode2 u2u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					u3u3u3_methode2 u3u3u3_methode2 periode 1e-6 position_like u1u1u1_methode1
+					}
+	}
+}
+
+EcritureLectureSpecial 0
+
+Resoudre pb
+ 
+Fin
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.lml.gz b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.lml.gz
new file mode 100644
index 0000000000..6dbb3f8b4c
Binary files /dev/null and b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF.lml.gz differ
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE1.son.ref
new file mode 100644
index 0000000000..cbeded905f
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U1U1_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U1_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -2.26527027e-14
+1.73609871e-03 -1.15311174e-13
+2.31479037e-03 -2.99001596e-13
+2.89347850e-03 -5.80417457e-13
+3.47216334e-03 -9.58580792e-13
+4.05084513e-03 -1.42858148e-12
+4.62952411e-03 -1.98362480e-12
+5.20820048e-03 -2.61613726e-12
+5.78687442e-03 -3.31835602e-12
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE2.son.ref
new file mode 100644
index 0000000000..5bac1202a9
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U1_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U1U1_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U1_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -2.26527027e-14
+1.73609871e-03 -1.15311174e-13
+2.31479037e-03 -2.99001596e-13
+2.89347850e-03 -5.80417457e-13
+3.47216334e-03 -9.58580792e-13
+4.05084513e-03 -1.42858148e-12
+4.62952411e-03 -1.98362480e-12
+5.20820048e-03 -2.61613726e-12
+5.78687442e-03 -3.31835602e-12
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE1.son.ref
new file mode 100644
index 0000000000..46b5c3e48b
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U1U2_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U2_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -4.56931963e-14
+1.73609871e-03 -2.36989146e-13
+2.31479037e-03 -6.25205318e-13
+2.89347850e-03 -1.23230794e-12
+3.47216334e-03 -2.06213704e-12
+4.05084513e-03 -3.10754383e-12
+4.62952411e-03 -4.35491800e-12
+5.20820048e-03 -5.78705876e-12
+5.78687442e-03 -7.38499816e-12
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE2.son.ref
new file mode 100644
index 0000000000..4971af5fb7
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U2_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U1U2_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U2_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -4.56931963e-14
+1.73609871e-03 -2.36989146e-13
+2.31479037e-03 -6.25205318e-13
+2.89347850e-03 -1.23230794e-12
+3.47216334e-03 -2.06213704e-12
+4.05084513e-03 -3.10754383e-12
+4.62952411e-03 -4.35491800e-12
+5.20820048e-03 -5.78705876e-12
+5.78687442e-03 -7.38499816e-12
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE1.son.ref
new file mode 100644
index 0000000000..a022eb32e7
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U1U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -5.05224794e-15
+1.73609871e-03 -2.61954964e-14
+2.31479037e-03 -6.86578915e-14
+2.89347850e-03 -1.33490753e-13
+3.47216334e-03 -2.18917985e-13
+4.05084513e-03 -3.21461679e-13
+4.62952411e-03 -4.36772000e-13
+5.20820048e-03 -5.60195372e-13
+5.78687442e-03 -6.87142168e-13
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE2.son.ref
new file mode 100644
index 0000000000..e2d49a4973
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U1U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U1U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U1U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -5.05224794e-15
+1.73609871e-03 -2.61954964e-14
+2.31479037e-03 -6.86578915e-14
+2.89347850e-03 -1.33490753e-13
+3.47216334e-03 -2.18917985e-13
+4.05084513e-03 -3.21461679e-13
+4.62952411e-03 -4.36772000e-13
+5.20820048e-03 -5.60195372e-13
+5.78687442e-03 -6.87142168e-13
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE1.son.ref
new file mode 100644
index 0000000000..6cbb349dff
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U2U2_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U2U2_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -9.21686130e-14
+1.73609871e-03 -4.87088644e-13
+2.31479037e-03 -1.30746418e-12
+2.89347850e-03 -2.61692373e-12
+3.47216334e-03 -4.43738903e-12
+4.05084513e-03 -6.76190998e-12
+4.62952411e-03 -9.56421672e-12
+5.20820048e-03 -1.28057460e-11
+5.78687442e-03 -1.64407399e-11
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE2.son.ref
new file mode 100644
index 0000000000..f2510e0645
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U2_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U2U2_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U2U2_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -9.21686130e-14
+1.73609871e-03 -4.87088644e-13
+2.31479037e-03 -1.30746418e-12
+2.89347850e-03 -2.61692373e-12
+3.47216334e-03 -4.43738903e-12
+4.05084513e-03 -6.76190998e-12
+4.62952411e-03 -9.56421672e-12
+5.20820048e-03 -1.28057460e-11
+5.78687442e-03 -1.64407399e-11
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE1.son.ref
new file mode 100644
index 0000000000..341b3f96e0
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U2U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U2U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -1.01909852e-14
+1.73609871e-03 -5.38400943e-14
+2.31479037e-03 -1.43574764e-13
+2.89347850e-03 -2.83434637e-13
+3.47216334e-03 -4.70913274e-13
+4.05084513e-03 -6.99081300e-13
+4.62952411e-03 -9.58414535e-13
+5.20820048e-03 -1.23820699e-12
+5.78687442e-03 -1.52758340e-12
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE2.son.ref
new file mode 100644
index 0000000000..88f7f9553a
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U2U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U2U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U2U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -1.01909852e-14
+1.73609871e-03 -5.38400943e-14
+2.31479037e-03 -1.43574764e-13
+2.89347850e-03 -2.83434637e-13
+3.47216334e-03 -4.70913274e-13
+4.05084513e-03 -6.99081300e-13
+4.62952411e-03 -9.58414535e-13
+5.20820048e-03 -1.23820699e-12
+5.78687442e-03 -1.52758340e-12
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE1.son.ref
new file mode 100644
index 0000000000..b9e8679628
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U3U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U3U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -1.12680635e-15
+1.73609871e-03 -5.95118739e-15
+2.31479037e-03 -1.57664429e-14
+2.89347850e-03 -3.07024816e-14
+3.47216334e-03 -4.99990985e-14
+4.05084513e-03 -7.23603589e-14
+4.62952411e-03 -9.62727240e-14
+5.20820048e-03 -1.20242788e-13
+5.78687442e-03 -1.42951394e-13
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE2.son.ref
new file mode 100644
index 0000000000..74db591575
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U1U3U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U1U3U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U1U3U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -1.12680635e-15
+1.73609871e-03 -5.95118739e-15
+2.31479037e-03 -1.57664429e-14
+2.89347850e-03 -3.07024816e-14
+3.47216334e-03 -4.99990985e-14
+4.05084513e-03 -7.23603589e-14
+4.62952411e-03 -9.62727240e-14
+5.20820048e-03 -1.20242788e-13
+5.78687442e-03 -1.42951394e-13
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE1.son.ref
new file mode 100644
index 0000000000..35a28643a9
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U2U2U2_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U2U2_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -1.85915058e-13
+1.73609871e-03 -1.00117428e-12
+2.31479037e-03 -2.73460086e-12
+2.89347850e-03 -5.55845539e-12
+3.47216334e-03 -9.55115626e-12
+4.05084513e-03 -1.47183055e-11
+4.62952411e-03 -2.10118152e-11
+5.20820048e-03 -2.83463294e-11
+5.78687442e-03 -3.66126392e-11
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE2.son.ref
new file mode 100644
index 0000000000..e78245af53
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U2_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U2U2U2_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U2U2_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -1.85915058e-13
+1.73609871e-03 -1.00117428e-12
+2.31479037e-03 -2.73460086e-12
+2.89347850e-03 -5.55845539e-12
+3.47216334e-03 -9.55115626e-12
+4.05084513e-03 -1.47183055e-11
+4.62952411e-03 -2.10118152e-11
+5.20820048e-03 -2.83463294e-11
+5.78687442e-03 -3.66126392e-11
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE1.son.ref
new file mode 100644
index 0000000000..03d12f4c66
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U2U2U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U2U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -2.05564295e-14
+1.73609871e-03 -1.10664194e-13
+2.31479037e-03 -3.00277659e-13
+2.89347850e-03 -6.01931216e-13
+3.47216334e-03 -1.01326075e-12
+4.05084513e-03 -1.52078166e-12
+4.62952411e-03 -2.10379672e-12
+5.20820048e-03 -2.73780195e-12
+5.78687442e-03 -3.39715437e-12
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE2.son.ref
new file mode 100644
index 0000000000..73cd24e33c
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U2U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U2U2U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U2U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -2.05564295e-14
+1.73609871e-03 -1.10664194e-13
+2.31479037e-03 -3.00277659e-13
+2.89347850e-03 -6.01931216e-13
+3.47216334e-03 -1.01326075e-12
+4.05084513e-03 -1.52078166e-12
+4.62952411e-03 -2.10379672e-12
+5.20820048e-03 -2.73780195e-12
+5.78687442e-03 -3.39715437e-12
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE1.son.ref
new file mode 100644
index 0000000000..d92bfff7cd
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U2U3U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U3U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -2.27290246e-15
+1.73609871e-03 -1.22322000e-14
+2.31479037e-03 -3.29730648e-14
+2.89347850e-03 -6.51925159e-14
+3.47216334e-03 -1.07545496e-13
+4.05084513e-03 -1.57320462e-13
+4.62952411e-03 -2.11143844e-13
+5.20820048e-03 -2.65560730e-13
+5.78687442e-03 -3.17442734e-13
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE2.son.ref
new file mode 100644
index 0000000000..aedb5e8971
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U2U3U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U2U3U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U2U3U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -2.27290246e-15
+1.73609871e-03 -1.22322000e-14
+2.31479037e-03 -3.29730648e-14
+2.89347850e-03 -6.51925159e-14
+3.47216334e-03 -1.07545496e-13
+4.05084513e-03 -1.57320462e-13
+4.62952411e-03 -2.11143844e-13
+5.20820048e-03 -2.65560730e-13
+5.78687442e-03 -3.17442734e-13
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE1.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE1.son.ref
new file mode 100644
index 0000000000..1d7289a9b2
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE1.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U3U3U3_METHODE1.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U3U3U3_METHODE1 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -2.51312399e-16
+1.73609871e-03 -1.35207888e-15
+2.31479037e-03 -3.62078602e-15
+2.89347850e-03 -7.06166536e-15
+3.47216334e-03 -1.14201266e-14
+4.05084513e-03 -1.62936895e-14
+4.62952411e-03 -2.12424418e-14
+5.20820048e-03 -2.58712844e-14
+5.78687442e-03 -2.98775739e-14
diff --git a/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE2.son.ref b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE2.son.ref
new file mode 100644
index 0000000000..9232b1d95e
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/Correlation_triple_VEF_U3U3U3_METHODE2.son.ref
@@ -0,0 +1,15 @@
+# Correlation_triple_VEF_U3U3U3_METHODE2.son
+# Temps x= 5.00000000e-01 y= 5.00000000e-01 z= 5.00000000e-01
+# Champ U3U3U3_METHODE2 [??]
+# Type POINT
+0.00000000e+00 0.00000000e+00
+5.78703704e-04 0.00000000e+00
+1.15740323e-03 -2.51312399e-16
+1.73609871e-03 -1.35207888e-15
+2.31479037e-03 -3.62078602e-15
+2.89347850e-03 -7.06166536e-15
+3.47216334e-03 -1.14201266e-14
+4.05084513e-03 -1.62936895e-14
+4.62952411e-03 -2.12424418e-14
+5.20820048e-03 -2.58712844e-14
+5.78687442e-03 -2.98775739e-14
diff --git a/tests/Reference/Correlation_triple_VEF/verifie b/tests/Reference/Correlation_triple_VEF/verifie
new file mode 100755
index 0000000000..f989b60130
--- /dev/null
+++ b/tests/Reference/Correlation_triple_VEF/verifie
@@ -0,0 +1,17 @@
+message()   # $1: actual exit status, $2: expected status; on mismatch print the failing command ($msg) and set err=1
+{
+   [ "$1" != "$2" ] && echo $ECHO_OPTS "Error ($1!=$2) when checking:\n $msg" && err=1
+}
+
+#####################################
+# Non-regression comparison of the .son probe files (reduction)
+#####################################
+err=0
+: >verifie.log   # start from an empty log once, then append, so the output of every comparison is kept
+for file in `ls *.son.ref 2>/dev/null`
+do
+   msg="compare_sonde $file ${file%.ref}"
+   eval $msg 1>>verifie.log 2>&1
+   message $? 0
+done
+exit $err
diff --git a/tests/Reference/Cylindre_2D_Axi_VDF_Radiation/Cylindre_2D_Axi_VDF_Radiation.data b/tests/Reference/Cylindre_2D_Axi_VDF_Radiation/Cylindre_2D_Axi_VDF_Radiation.data
index 427df828f6..b939aa3873 100644
--- a/tests/Reference/Cylindre_2D_Axi_VDF_Radiation/Cylindre_2D_Axi_VDF_Radiation.data
+++ b/tests/Reference/Cylindre_2D_Axi_VDF_Radiation/Cylindre_2D_Axi_VDF_Radiation.data
@@ -80,7 +80,7 @@ Lire pb
 
     Navier_Stokes_standard
     {
-        solveur_pression cholesky { }
+        solveur_pression petsc cholesky { }
         convection { amont }
         diffusion { }
         sources { boussinesq_temperature { T0 431. } }
diff --git a/tests/Reference/Cylindre_tournant/Cylindre_tournant_EC_dans_repere_fixe.son_ref b/tests/Reference/Cylindre_tournant/Cylindre_tournant_EC_dans_repere_fixe.son_ref
new file mode 100644
index 0000000000..cc2e762c74
--- /dev/null
+++ b/tests/Reference/Cylindre_tournant/Cylindre_tournant_EC_dans_repere_fixe.son_ref
@@ -0,0 +1,3 @@
+# Temps        Energie_cinetique_totale
+0.00000000e+00 1.17216882e-02
+4.52937624e-03 1.17216694e-02
diff --git a/tests/Reference/Cylindre_tournant/verifie b/tests/Reference/Cylindre_tournant/verifie
new file mode 100755
index 0000000000..c991c9f644
--- /dev/null
+++ b/tests/Reference/Cylindre_tournant/verifie
@@ -0,0 +1,5 @@
+if [ -f "$1_EC_dans_repere_fixe.son_ref" ]   # $1 is used as the file-name prefix (presumably the test case name); skip the check when no reference exists
+then
+compare_sonde "$1_EC_dans_repere_fixe.son_ref" "$1_EC_dans_repere_fixe.son" 1>verifie.log 2>&1 || exit 1   # exit status must lie in 0-255; "exit -1" is not portable
+fi
+exit 0
diff --git a/tests/Reference/DEC_64/DEC_64.data b/tests/Reference/DEC_64/DEC_64.data
index 2f8b72e1d0..6bc3ac78df 100644
--- a/tests/Reference/DEC_64/DEC_64.data
+++ b/tests/Reference/DEC_64/DEC_64.data
@@ -23,6 +23,11 @@ Mailler_64 dom
 }
 Raffiner_simplexes_64 dom
 Transformer_64 dom x y z
+Analyse_Angle_64 dom 10
+
+Corriger_frontiere_periodique_64 { domaine dom bord paroiY }
+Ecrire_med_64 dom dom.med
+Lire_med_64 { domaine dom file dom.med }
 Partition_64 dom
 {
     Partition_tool metis_64 { nb_parts 4 }
@@ -30,6 +35,10 @@ Partition_64 dom
     Larg_joint 2
     zones_name DOM
 }
+#
+Discretiser_domaine_64 dom
+Postraiter_domaine_64 { format single_lata fichier dom domaine dom }
+#
 
 # VEF domain #
 lire_med_64 { domaine dom file tetra.med }
@@ -37,15 +46,24 @@ lire_med_64 { domaine dom file tetra.med }
 RegroupeBord_64 dom perioz { entree sortie }
 
 Raffiner_simplexes_64 dom
+Transformer_64 dom x y z
 
 Analyse_Angle_64 dom 10
 
 Declarer_bord_perio_64 { domaine dom bord paroiY }
-
+Ecrire_med_64 dom dom.med
+Lire_med_64 { domaine dom file dom.med }
 Decouper_64 dom	{
 	partitionneur metis_64 { Nb_parts 3 }
 	larg_joint 2
 	single_hdf
 	nom_Zones dom
 }
+Ecrire_fichier dom dom.geom
+Lire_fichier dom dom.geom
+
+#
+Discretiser_domaine_64 dom
+Postraiter_domaine_64 { format single_lata fichier dom domaine dom }
+#
 Fin
diff --git a/tests/Reference/Rayonnement_VDF/Rayonnement_VDF.data b/tests/Reference/Rayonnement_VDF/Rayonnement_VDF.data
index 4fee042a2d..d8dfca79e3 100644
--- a/tests/Reference/Rayonnement_VDF/Rayonnement_VDF.data
+++ b/tests/Reference/Rayonnement_VDF/Rayonnement_VDF.data
@@ -114,7 +114,7 @@ Lire pb2
 
     Navier_Stokes_standard
     {
-        solveur_pression cholesky { }
+        solveur_pression petsc cholesky { }
         convection { amont }
         diffusion { }
         sources { boussinesq_temperature { T0 300. } }
diff --git a/tests/Reference/cpu_3D/cpu_3D.TU.ref_is246827 b/tests/Reference/cpu_3D/cpu_3D.TU.ref_is246827
new file mode 100644
index 0000000000..3583e9a0d8
--- /dev/null
+++ b/tests/Reference/cpu_3D/cpu_3D.TU.ref_is246827
@@ -0,0 +1,44 @@
+Statistiques d'initialisation du calcul
+
+Temps total                       0.409547
+
+Statistiques de resolution du probleme
+
+Temps total                       0.6614
+
+
+Timesteps                         3
+Secondes / pas de temps           0.220459
+Dont solveurs Ax=B                0.154522 70% (1 appel/pas de temps)
+Dont mettre_a_jour                0.001801  0% (1 appel/pas de temps)
+Dont operateurs convection        0.033740 15% (2 appels/pas de temps)
+Dont operateurs diffusion         0.013478  6% (2 appels/pas de temps)
+Dont operateurs gradient          0.003080  1% (2 appels/pas de temps)
+Dont operateurs divergence        0.002406  1% (2 appels/pas de temps)
+Dont operateurs source            0.001065  0% (1 appel/pas de temps)
+Dont operations postraitement     0.001074  0% (1 appel/pas de temps)
+Dont calcul dt                    0.002536  1% (4 appels/pas de temps)
+Dont calcul divers                0.006757  3% (0 appels/pas de temps)
+Nb solveur / pas de temps         1
+Secondes / solveur                0.154522
+Iterations / solveur              103
+I/O:
+
+Timesteps = number of time steps
+Nb solveur = number of linear system resolutions
+Nb assemblage implicite = number of matrix assemblies for the implicit scheme
+Iterations = average number of iterations of the solver
+Communications = fraction of the time spent
+                 in communications between processors (excluding io files)
+Network latency = time of one mpsum measured by an internal bench over 0.1s
+Network bandwidth = maximum on all processors
+                    of the average bandwidth of send_recv operations
+Waiting time = estimation of the waiting time of the different processors
+
+Max_waiting_time big    => probably due to a bad partitioning
+Communications > 30%    => too many processors or network too slow
+
+Statistiques de post resolution
+
+Temps total                       0.001712
+
diff --git a/tests/Reference/rayo_transp_VDF/rayo_transp_VDF.data b/tests/Reference/rayo_transp_VDF/rayo_transp_VDF.data
index 5c2746edc7..6766d5d38a 100644
--- a/tests/Reference/rayo_transp_VDF/rayo_transp_VDF.data
+++ b/tests/Reference/rayo_transp_VDF/rayo_transp_VDF.data
@@ -148,7 +148,7 @@ Lire pb_f1
 
     Navier_Stokes_standard
     {
-        solveur_pression cholesky { }
+        solveur_pression petsc cholesky { }
         convection { amont }
         diffusion { }
         sources { boussinesq_temperature { T0 300. } }
@@ -254,7 +254,7 @@ Lire pb_f2
 
     Navier_Stokes_standard
     {
-        solveur_pression cholesky { }
+        solveur_pression petsc cholesky { }
         convection { amont }
         diffusion { }
         sources { boussinesq_temperature { T0 300. } }
diff --git a/tests/Turbulence/Marche3D_Turb_Null/Marche3D_Turb_Null.data b/tests/Turbulence/Marche3D_Turb_Null/Marche3D_Turb_Null.data
index 4278aea188..1e7d886863 100644
--- a/tests/Turbulence/Marche3D_Turb_Null/Marche3D_Turb_Null.data
+++ b/tests/Turbulence/Marche3D_Turb_Null/Marche3D_Turb_Null.data
@@ -52,7 +52,7 @@ Read pb
 
     Navier_Stokes_Turbulent
     {
-        solveur_pression cholesky { }
+        solveur_pression petsc cholesky { }
         convection { amont }
         diffusion { }
         initial_conditions {
diff --git a/tests/UnitTests/unit_device.cpp b/tests/UnitTests/unit_device.cpp
index a45d4635c7..64c97c4e8a 100644
--- a/tests/UnitTests/unit_device.cpp
+++ b/tests/UnitTests/unit_device.cpp
@@ -466,9 +466,35 @@ TEST(DeviceTest, DoubleTravCopyConstructor)
 
 
 
-TEST(DeviceTest, copyToDevice2)
+// NVCC extended lambdas cannot be used inside protected/private member functions
+// (GTest's TestBody is protected). Extract kernels as free functions at file scope.
+template<class View3D>
+static void check_values_3d(View3D view_rw, int n0, int n1, int n2, bool& all_correct)
+{
+  // Visit every (i, j, k) in [0,n0) x [0,n1) x [0,n2) and AND-reduce "entry equals i + j - k" into all_correct.
+  using Range3D = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+  Kokkos::parallel_reduce("CheckValues", Range3D({0, 0, 0}, {n0, n1, n2}),
+                          KOKKOS_LAMBDA(int i, int j, int k, bool& ok) {
+                            const int expected = i + j - k;
+                            if (view_rw(i, j, k) == expected) return;
+                            ok = false; // a single mismatch is enough to fail the whole reduction
+                            printf("Mismatch at i=%d, j=%d, k=%d: view_rw(i,j,k)=%d, expected=%d\n", i, j, k, view_rw(i, j, k), expected);
+                          },
+                          Kokkos::LAnd<bool>(all_correct));
+}
+
+template<class View3D>
+static void set_values_3d(View3D view_rw, int n0, int n1, int n2)
 {
+  // Store a + b - c into every entry (a, b, c) of the [0,n0) x [0,n1) x [0,n2) index box.
+  Kokkos::parallel_for("SetValues", Kokkos::MDRangePolicy<Kokkos::Rank<3>>({0, 0, 0}, {n0, n1, n2}),
+                       KOKKOS_LAMBDA(int a, int b, int c) {
+                         view_rw(a, b, c) = a + b - c;
+                       });
+}
 
+TEST(DeviceTest, copyToDevice2)
+{
   int n0=2, n1=3, n2=4;
   TRUSTTab<int> tab(n0,n1,n2);
 
@@ -480,40 +506,22 @@ TEST(DeviceTest, copyToDevice2)
     }
   }
 
-  //This does a map to device
   auto view_rw = tab.view_rw<3, Kokkos::DefaultExecutionSpace>();
-  
-  // Parallel reduce to check if all values match i + j + k
-  auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<3>>({0, 0, 0}, {n0, n1, n2});
 
   bool all_correct = true;
-  Kokkos::parallel_reduce("CheckValues", policy,
-                          KOKKOS_LAMBDA(int i, int j, int k, bool& result) {
-                    
-                            if (view_rw(i, j, k) != i + j - k) {result = false;printf("Mismatch at i=%d, j=%d, k=%d: view_rw(i,j,k)=%d, expected=%d\n", 
-           i, j, k, view_rw(i, j, k), i + j - k);}},
-          Kokkos::LAnd<bool>(all_correct));
+  check_values_3d(view_rw, n0, n1, n2, all_correct);
 
   EXPECT_TRUE(all_correct);
 }
 
 TEST(DeviceTest, copyFromDevice2)
 {
-
   int n0=10, n1=11, n2=12;
   TRUSTTab<int> tab(n0,n1,n2);
 
-  //This does a map to device
   auto view_rw = tab.view_rw<3, Kokkos::DefaultExecutionSpace>();
 
-  // Parallel reduce to check if all values match i + j + k
-  auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<3>>({0, 0, 0}, {n0, n1, n2});
-
-  Kokkos::parallel_for("SetValues", policy,
-                      KOKKOS_LAMBDA(int i, int j, int k) 
-                      {
-                        view_rw(i, j, k) = i + j - k;
-                      });
+  set_values_3d(view_rw, n0, n1, n2);
 
   //copyFromDevice(tab);
 
@@ -526,6 +534,47 @@ TEST(DeviceTest, copyFromDevice2)
   }
 }
 
+// NVCC: KOKKOS_LAMBDA cannot appear inside GTest's protected TestBody.
+// Extract all kernel patterns as file-scope template helpers.
+template<class View1D>
+static void check_values_1d(View1D view, int N, bool& all_correct)
+{ // AND-reduces over i in [0, N) whether view(i) == i; the outcome is written to all_correct.
+  Kokkos::parallel_reduce("CheckValues", N, KOKKOS_LAMBDA(int i, bool& result) {
+                            // Convert i to the element VALUE type. decltype(view(i)) is a reference type, so casting to it reinterprets the bytes of i for non-int views.
+                            if (view(i) != static_cast<typename View1D::non_const_value_type>(i)) result = false;
+                          }, Kokkos::LAnd<bool>(all_correct));
+}
+
+template<class View2D>
+static void check_values_2d_col(View2D view, int N, int col, bool& all_correct)
+{
+  // AND-reduce over rows r in [0, N): column `col` must satisfy view(r, col) == r.
+  Kokkos::parallel_reduce("CheckValues", N, KOKKOS_LAMBDA(int r, bool& ok) {
+                            if (!(view(r, col) == r)) ok = false;
+                          }, Kokkos::LAnd<bool>(all_correct));
+}
+
+template<class View2D>
+static void check_values_2d_row(View2D view, int N, int row, bool& all_correct)
+{
+  // AND-reduce over columns c in [0, N): row `row` must satisfy view(row, c) == c.
+  Kokkos::parallel_reduce("CheckValues", N, KOKKOS_LAMBDA(int c, bool& ok) {
+                            if (!(view(row, c) == c)) ok = false;
+                          }, Kokkos::LAnd<bool>(all_correct));
+}
+
+// Checks a 2D view through a flat index: for i in [0, N), element (i / ncols, i % ncols) must equal i. Requires ncols > 0.
+template<class View2D>
+static void check_values_2d_flat(View2D view, int N, int ncols, bool& all_correct)
+{
+  Kokkos::parallel_reduce("CheckValues", N,
+                          KOKKOS_LAMBDA(int i, bool& result) {
+                            const int row = i / ncols; // was (int)(i >= ncols): identical while i < 2*ncols, but stuck on row 1 beyond that
+                            const int col = i % ncols;
+                            if (view(row, col) != i) result = false;
+                          }, Kokkos::LAnd<bool>(all_correct));
+}
+
 TEST(DeviceTest, resizeGPUArrayUP){
 
   int N=10;
@@ -538,13 +587,8 @@ TEST(DeviceTest, resizeGPUArrayUP){
 
   for (int i=0; i<N; i++){EXPECT_EQ(i, a[i]);}
 
-  auto policy = Kokkos::RangePolicy(0, N);
-
   bool all_correct = true;
-  Kokkos::parallel_reduce("CheckValues", policy,
-                          KOKKOS_LAMBDA(int i, bool& result) {
-                            if (view_rw(i) != i) {result = false;printf("Mismatch at i=%d, view_rw(i)=%d\n", i, view_rw(i));}},
-          Kokkos::LAnd<bool>(all_correct));
+  check_values_1d(view_rw, N, all_correct);
   EXPECT_TRUE(all_correct);
 }
 
@@ -558,13 +602,8 @@ TEST(DeviceTest, resizeGPUArrayDOWN){
   a.resize(N/2);
   for (int i=0; i<N/2; i++){EXPECT_EQ(i, a[i]);}
 
-  auto policy = Kokkos::RangePolicy(0, N/2);
-
   bool all_correct = true;
-  Kokkos::parallel_reduce("CheckValues", policy,
-                          KOKKOS_LAMBDA(int i, bool& result) {
-                            if (view_rw(i) != i) {result = false;printf("Mismatch at i=%d, view_rw(i)=%f\n", i, view_rw(i));}},
-          Kokkos::LAnd<bool>(all_correct));
+  check_values_1d(view_rw, N/2, all_correct);
   EXPECT_TRUE(all_correct);
 }
 
@@ -583,15 +622,9 @@ TEST(DeviceTest, resizeGPUTabUP){
   auto view = tab.view_rw();
 
   bool all_correct = true;
-  Kokkos::parallel_reduce("CheckValues", 6,
-                          KOKKOS_LAMBDA(int i, bool& result) {
-                            int k=(int)(i>=3);
-                            int l=i%3;
-                            if (view(k,l) != i) {result = false;
-                             printf("Mismatch, i=%d, view(%d,%d)=%d\n", i, k,l, view(k,l));}},
-          Kokkos::LAnd<bool>(all_correct));
+  check_values_2d_flat(view, 6, 3, all_correct);
   EXPECT_TRUE(all_correct);
-  
+
   tab.resize(4,6);
 
   auto view2 = tab.view_rw();
@@ -599,15 +632,9 @@ TEST(DeviceTest, resizeGPUTabUP){
   for (int i=0; i<6; i++){
       EXPECT_EQ(tab(0,i),i);
   }
-  
+
   all_correct = true;
-  Kokkos::parallel_reduce("CheckValues", 6,
-                          KOKKOS_LAMBDA(int i, bool& result) {
-                            if (view2(0,i) != i) {
-                              result = false;
-                              printf("Mismatch at i=%d, view2(0,i)=%d\n", i, view2(0,i));
-                              }},
-          Kokkos::LAnd<bool>(all_correct));
+  check_values_2d_row(view2, 6, 0, all_correct);
   EXPECT_TRUE(all_correct);
   }
 
@@ -634,13 +661,7 @@ TEST(DeviceTest, resizeGPUTabDOWN){
   EXPECT_EQ(tab(1,2), 5);
 
   bool all_correct = true;
-  Kokkos::parallel_reduce("CheckValues", 6,
-                          KOKKOS_LAMBDA(int i, bool& result) {
-                            int k=(int)(i>=3);
-                            int l=i%3;
-                            if (view2(k,l) != i) {result = false;
-                             printf("Mismatch, i=%d, view(%d,%d)=%d\n", i, k,l, view2(k,l));}},
-          Kokkos::LAnd<bool>(all_correct));
+  check_values_2d_flat(view2, 6, 3, all_correct);
   EXPECT_TRUE(all_correct);
 }
 
@@ -705,10 +726,7 @@ TEST(DeviceTest, append_line_GPU_dim1){
   auto view1=tab.view_rw();
 
   bool all_correct = true;
-  Kokkos::parallel_reduce("CheckValues", 5,
-                          KOKKOS_LAMBDA(int i, bool& result) {
-                            if (view1(i,0) != i) {result = false;}},
-          Kokkos::LAnd<bool>(all_correct));
+  check_values_2d_col(view1, 5, 0, all_correct);
   EXPECT_TRUE(all_correct);
 
 }
@@ -750,17 +768,11 @@ TEST(DeviceTest, append_line_GPU_dim2){
   auto view1=tab.view_rw();
 
   bool all_correct = true;
-  Kokkos::parallel_reduce("CheckValues", 5,
-                          KOKKOS_LAMBDA(int i, bool& result) {
-                            if (view1(i,0) != i) {result = false;}},
-          Kokkos::LAnd<bool>(all_correct));
+  check_values_2d_col(view1, 5, 0, all_correct);
   EXPECT_TRUE(all_correct);
 
   all_correct = true;
-  Kokkos::parallel_reduce("CheckValues", 5,
-                          KOKKOS_LAMBDA(int i, bool& result) {
-                            if (view1(i,1) != i) {result = false;}},
-          Kokkos::LAnd<bool>(all_correct));
+  check_values_2d_col(view1, 5, 1, all_correct);
   EXPECT_TRUE(all_correct);
 
 }
@@ -797,11 +809,7 @@ TEST(DeviceTest, append_line_GPU_dim3) {
   // Verify with parallel_reduce for each column
   for (int col = 0; col < 3; col++) {
     bool all_correct = true;
-    Kokkos::parallel_reduce("CheckValues", 5,
-                            KOKKOS_LAMBDA(int i, bool& result) {
-                              if (view1(i, col) != i) {result = false;}
-                            },
-                            Kokkos::LAnd<bool>(all_correct));
+    check_values_2d_col(view1, 5, col, all_correct);
     EXPECT_TRUE(all_correct);
   }
 }
@@ -839,11 +847,7 @@ TEST(DeviceTest, append_line_GPU_dim4) {
   // Verify with parallel_reduce for each column
   for (int col = 0; col < 4; col++) {
     bool all_correct = true;
-    Kokkos::parallel_reduce("CheckValues", 5,
-                            KOKKOS_LAMBDA(int i, bool& result) {
-                              if (view1(i, col) != i) {result = false;}
-                            },
-                            Kokkos::LAnd<bool>(all_correct));
+    check_values_2d_col(view1, 5, col, all_correct);
     EXPECT_TRUE(all_correct);
   }
 }
@@ -867,11 +871,7 @@ TEST(DeviceTest, copy_ctor_tab_GPU) {
   // Verify with parallel_reduce for each column
   for (int col = 0; col < 4; col++) {
     bool all_correct = true;
-    Kokkos::parallel_reduce("CheckValues", 5,
-                            KOKKOS_LAMBDA(int i, bool& result) {
-                              if (view2(i, col) != i) {result = false;}
-                            },
-                            Kokkos::LAnd<bool>(all_correct));
+    check_values_2d_col(view2, 5, col, all_correct);
     EXPECT_TRUE(all_correct);
   }
 }