devitocodes
diff --git a/‎FAQ.md‎
Lines changed: 22 additions & 22 deletions b/‎FAQ.md‎
Lines changed: 22 additions & 22 deletions
diff --git a/‎benchmarks/user/README.md‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/user/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎devito/arch/archinfo.py‎
Lines changed: 100 additions & 37 deletions b/‎devito/arch/archinfo.py‎
Lines changed: 100 additions & 37 deletions
diff --git a/‎devito/arch/compiler.py‎
Lines changed: 44 additions & 18 deletions b/‎devito/arch/compiler.py‎
Lines changed: 44 additions & 18 deletions
diff --git a/‎devito/core/cpu.py‎
Lines changed: 0 additions & 1 deletion b/‎devito/core/cpu.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎devito/core/gpu.py‎
Lines changed: 0 additions & 1 deletion b/‎devito/core/gpu.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎devito/finite_differences/derivative.py‎
Lines changed: 4 additions & 15 deletions b/‎devito/finite_differences/derivative.py‎
Lines changed: 4 additions & 15 deletions
@@ -147,7 +147,7 @@ selected and used for all remaining timesteps.
 
 The "backend compiler" takes as input the code generated by Devito and
 translates it into a shared object. Supported backend compilers are `gcc`,
-`icc`, `pgcc`, `clang`. For each of these compilers, Devito uses some preset compilation
+`icc`, `clang`. For each of these compilers, Devito uses some preset compilation
 flags (e.g., -O3, -march=native, etc).
 
 The default backend compiler is `gcc`. To change it, one should set the
 
@@ -1,7 +1,8 @@
 """Collection of utilities to detect properties of the underlying architecture."""
 
+from contextlib import suppress
 from functools import cached_property
-from subprocess import PIPE, Popen, DEVNULL, run
+from subprocess import PIPE, Popen, DEVNULL, run, CalledProcessError
 from pathlib import Path
 import ctypes
 import re
@@ -11,35 +12,38 @@
 
 import cpuinfo
 import numpy as np
+from packaging.version import parse, InvalidVersion
 import psutil
 
 from devito.logger import warning
 from devito.tools import as_tuple, all_equal, memoized_func
 
-__all__ = ['platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_visible_devices',
-           'get_nvidia_cc', 'get_cuda_path', 'get_hip_path', 'check_cuda_runtime',
-           'get_m1_llvm_path', 'get_advisor_path', 'Platform', 'Cpu64', 'Intel64',
-           'IntelSkylake', 'Amd', 'Arm', 'Power', 'Device', 'NvidiaDevice',
-           'AmdDevice', 'IntelDevice',
-           # Brand-agnostic
-           'ANYCPU', 'ANYGPU',
-           # Intel CPUs
-           'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
-           'SKX', 'KLX', 'CLX', 'CLK', 'SPR',
-           # AMD CPUs
-           'AMD',
-           # ARM CPUs
-           'ARM', 'AppleArm', 'M1', 'M2', 'M3',
-           'Graviton', 'GRAVITON2', 'GRAVITON3', 'GRAVITON4',
-           'Cortex', 'NvidiaArm', 'GRACE',
-           # Other legacy CPUs
-           'POWER8', 'POWER9',
-           # Generic GPUs
-           'AMDGPUX', 'NVIDIAX', 'INTELGPUX',
-           # Nvidia GPUs
-           'VOLTA', 'AMPERE', 'HOPPER', 'BLACKWELL',
-           # Intel GPUs
-           'PVC', 'INTELGPUMAX', 'MAX1100', 'MAX1550']
+__all__ = [
+    'platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_visible_devices',
+    'get_nvidia_cc', 'get_cuda_path', 'get_cuda_version', 'get_hip_path',
+    'check_cuda_runtime', 'get_m1_llvm_path', 'get_advisor_path', 'Platform',
+    'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power', 'Device',
+    'NvidiaDevice', 'AmdDevice', 'IntelDevice',
+    # Brand-agnostic
+    'ANYCPU', 'ANYGPU',
+    # Intel CPUs
+    'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
+    'SKX', 'KLX', 'CLX', 'CLK', 'SPR',
+    # AMD CPUs
+    'AMD',
+    # ARM CPUs
+    'ARM', 'AppleArm', 'M1', 'M2', 'M3',
+    'Graviton', 'GRAVITON2', 'GRAVITON3', 'GRAVITON4',
+    'Cortex', 'NvidiaArm', 'GRACE',
+    # Other legacy CPUs
+    'POWER8', 'POWER9',
+    # Generic GPUs
+    'AMDGPUX', 'NVIDIAX', 'INTELGPUX',
+    # Nvidia GPUs
+    'VOLTA', 'AMPERE', 'HOPPER', 'BLACKWELL',
+    # Intel GPUs
+    'PVC', 'INTELGPUMAX', 'MAX1100', 'MAX1550'
+]
 
 
 @memoized_func
@@ -553,6 +557,30 @@ def get_cuda_path():
     return None
 
 
+@memoized_func
+def get_cuda_version():
+    """
+    Detect the version of the CUDA toolkit by parsing `nvcc --version`.
+
+    `nvcc` is looked up under `<cuda_home>/bin` when `get_cuda_path()` finds
+    a CUDA installation, otherwise it is resolved through `PATH`.
+
+    Returns
+    -------
+    packaging.version.Version or None
+        The parsed version, or None if `nvcc` cannot be executed, exits with
+        a non-zero status, or prints something that is not a valid version.
+    """
+    cuda_home = get_cuda_path()
+    nvcc = 'nvcc' if cuda_home is None else f'{cuda_home}/bin/nvcc'
+
+    try:
+        out = run([nvcc, '--version'], capture_output=True, text=True)
+    except (OSError, CalledProcessError):
+        # `nvcc` is missing or not executable (OSError covers both
+        # FileNotFoundError and PermissionError): there is nothing to parse.
+        # Returning here, rather than inspecting `out` in a `finally` clause,
+        # avoids touching a name that was never bound
+        return None
+    if out.returncode != 0:
+        return None
+
+    # The version is expected on a line such as
+    # `Cuda compilation tools, release X.Y, VX.Y.Z`
+    # Take what follows the first comma after `release`, up to the newline
+    start = out.stdout.find(',', out.stdout.find('release')) + 1
+    stop = out.stdout.find('\n', start)
+
+    cuda_version = None
+    with suppress(InvalidVersion):
+        cuda_version = parse(out.stdout[start:stop])
+
+    return cuda_version
+
+
 @memoized_func
 def get_advisor_path():
     """
@@ -619,28 +647,35 @@ def get_m1_llvm_path(language):
 
 @memoized_func
 def check_cuda_runtime():
-    libnames = ('libcudart.so', 'libcudart.dylib', 'cudart.dll')
-    for libname in libnames:
-        try:
-            cuda = ctypes.CDLL(libname)
-        except OSError:
-            continue
-        else:
-            break
-    else:
+    """
+    Check that the NVidia driver is recent enough for the CUDA runtime.
+
+    Both versions are queried from `libcudart`. A warning is emitted if the
+    library cannot be found or loaded, if either query fails, or if the
+    driver is older than the runtime within the same major version.
+
+    Raises
+    ------
+    RuntimeError
+        If the major version of the driver is lower than that of the runtime.
+    """
+    # `import ctypes` alone does not guarantee the `util` submodule is loaded
+    import ctypes.util
+
+    libname = ctypes.util.find_library("cudart")
+    if not libname:
         warning("Unable to check compatibility of NVidia driver and runtime")
         return
 
+    try:
+        cuda = ctypes.CDLL(libname)
+    except OSError:
+        # The library was located but cannot be loaded
+        warning("Unable to check compatibility of NVidia driver and runtime")
+        return
+
     driver_version = ctypes.c_int()
     runtime_version = ctypes.c_int()
 
     if cuda.cudaDriverGetVersion(ctypes.byref(driver_version)) == 0 and \
        cuda.cudaRuntimeGetVersion(ctypes.byref(runtime_version)) == 0:
         driver_version = driver_version.value
         runtime_version = runtime_version.value
-        if driver_version < runtime_version:
-            warning("The NVidia driver (v%d) on this system may not be compatible "
-                    "with the CUDA runtime (v%d)" % (driver_version, runtime_version))
+
+        # The versions are integers encoded as `1000*major + 10*minor`, so
+        # they are compared as integers; going through a float would
+        # conflate, e.g., minor versions 1 and 10
+        # First check the "major" version, known to be incompatible
+        if driver_version // 1000 < runtime_version // 1000:
+            raise RuntimeError(
+                f'The NVidia driver (v{driver_version}) on this system is '
+                f'not compatible with the CUDA runtime (v{runtime_version})'
+            )
+        # Next check the version including minor revisions which may still
+        # be compatible
+        elif driver_version < runtime_version:
+            warning(
+                f'The NVidia driver (v{driver_version}) on this system may '
+                f'not be compatible with the CUDA runtime (v{runtime_version})'
+            )
     else:
         warning("Unable to check compatibility of NVidia driver and runtime")
 
@@ -1069,6 +1104,32 @@ def march(self):
                 return 'tesla'
         return None
 
+    @cached_property
+    def max_shm_per_block(self):
+        """
+        Get the maximum amount of shared memory per thread block, in bytes.
+
+        The value is queried from the CUDA runtime (`libcudart`) for the
+        current device. If the runtime cannot be found or loaded, or if any
+        of the queries fails, a default of 64 KB is returned.
+        """
+        # `import ctypes` alone does not guarantee the `util` submodule is
+        # loaded
+        import ctypes.util
+
+        default = 64 * 1024  # 64 KB
+
+        # Load libcudart
+        libname = ctypes.util.find_library("cudart")
+        if not libname:
+            return default
+        try:
+            lib = ctypes.CDLL(libname)
+        except OSError:
+            # The library was located but cannot be loaded
+            return default
+
+        # Numeric value of the attribute to query
+        cudaDevAttrMaxSharedMemoryPerBlockOptin = 97
+
+        # Get current device; a non-zero status means the call failed, in
+        # which case `dev` carries no meaningful value
+        dev = ctypes.c_int()
+        if lib.cudaGetDevice(ctypes.byref(dev)) != 0:
+            return default
+
+        # Query attribute; again, only trust `value` upon success, otherwise
+        # the zero-initialized `c_int` would be reported as the limit
+        value = ctypes.c_int()
+        status = lib.cudaDeviceGetAttribute(
+            ctypes.byref(value),
+            ctypes.c_int(cudaDevAttrMaxSharedMemoryPerBlockOptin),
+            dev
+        )
+        if status != 0:
+            return default
+
+        return value.value
+
     def supports(self, query, language=None):
         if language != 'cuda':
             return False
@@ -1125,6 +1186,8 @@ class AmdDevice(Device):
 
     max_mem_trans_nbytes = 256
 
+    max_shm_per_block = 64*1024  # 64 KB
+
     @cached_property
     def march(cls):
         # TODO: this corresponds to Vega, which acts as the fallback `march`
 
@@ -1,5 +1,6 @@
 from functools import partial
 from hashlib import sha1
+from itertools import filterfalse
 from os import environ, path, makedirs
 from packaging.version import Version
 from subprocess import (DEVNULL, PIPE, CalledProcessError, check_output,
@@ -13,9 +14,11 @@
 from codepy.toolchain import (GCCToolchain,
                               call_capture_output as _call_capture_output)
 
-from devito.arch import (AMDGPUX, Cpu64, AppleArm, NvidiaDevice, POWER8, POWER9,
-                         Graviton, Cortex, IntelDevice, get_nvidia_cc, NvidiaArm,
-                         check_cuda_runtime, get_m1_llvm_path)
+from devito.arch import (
+    AMDGPUX, Cpu64, AppleArm, NvidiaDevice, POWER8, POWER9, Graviton,
+    Cortex, IntelDevice, get_nvidia_cc, NvidiaArm, check_cuda_runtime,
+    get_cuda_version, get_m1_llvm_path
+)
 from devito.exceptions import CompilationError
 from devito.logger import debug, warning
 from devito.parameters import configuration
@@ -63,7 +66,10 @@ def sniff_compiler_version(cc, allow_fail=False):
     elif ver.startswith("icx"):
         compiler = "icx"
     elif ver.startswith("pgcc"):
-        compiler = "pgcc"
+        raise CompilationError(
+            'Portland compiler no longer supported,'
+            ' use `nvc` from the nvidia HPC SDK instead'
+        )
     elif ver.startswith("nvc++"):
         compiler = "nvc"
     elif ver.startswith("cray"):
@@ -626,7 +632,7 @@ def __lookup_cmds__(self):
         self.MPICXX = 'mpicxx'
 
 
-class PGICompiler(Compiler):
+class NvidiaCompiler(Compiler):
 
     _default_cpp = True
 
@@ -656,25 +662,41 @@ def __init_finalize__(self, **kwargs):
 
         if not configuration['safe-math']:
             self.cflags.append('-fast')
-        # Default PGI compile for a target is GPU and single threaded host.
+        # Default compile for a target is GPU and single threaded host.
         # self.cflags += ['-ta=tesla,host']
 
     def __lookup_cmds__(self):
-        # NOTE: using `pgc++` instead of `pgcc` because of issue #1219
-        self.CC = 'pgc++'
-        self.CXX = 'pgc++'
-        self.MPICC = 'mpic++'
-        self.MPICXX = 'mpicxx'
-
-
-class NvidiaCompiler(PGICompiler):
-
-    def __lookup_cmds__(self):
+        # Note: Using `nvc++` instead of `nvc` because of issue #1219
         self.CC = 'nvc++'
         self.CXX = 'nvc++'
         self.MPICC = 'mpic++'
         self.MPICXX = 'mpicxx'
 
+    def add_libraries(self, libs):
+        """
+        Add `libs` to the libraries to link against, with support for
+        versioned shared objects given as `:libfoo.so.2.0`.
+
+        Entries starting with `:` are turned into linker flags; all other
+        entries are forwarded unchanged to the parent implementation.
+        """
+        # NvidiaCompiler inherits from Compiler inherits from GCCToolchain in
+        # codepy, and _GCC_ supports linking versioned shared objects with the
+        # syntax:
+        # `gcc -L/path/to/versioned/lib -l:libfoo.so.2.0 ...`
+        # But this syntax is not supported by the Nvidia compiler, nor does
+        # `codepy.GCCToolchain` know about linking to versioned objects.
+        #
+        # Since this is just linking information, we tell the linker (which we
+        # invoke through the compiler with the `-Wl,-options` syntax) to look
+        # in all of the directories provided thus far, as the linker supports:
+        # `ld -L/path/to/versioned/lib -l:libfoo.so.2.0 ...`
+        #
+        # Note: It would be nicer to just look in the one _relevant_ lib dir!
+        def is_versioned(name):
+            return name.startswith(':')
+
+        new = as_list(libs)
+        sonames = [name.removeprefix(':') for name in new if is_versioned(name)]
+
+        # The search path is the same for every versioned library, so it is
+        # built once. Joining the individual options also avoids emitting a
+        # dangling `-L` when no library directory has been registered
+        search_dirs = [f'-L{d}' for d in self.library_dirs]
+        self.add_ldflags([
+            ','.join(['-Wl', *search_dirs, f'-l:{soname}'])
+            for soname in sonames
+        ])
+        super().add_libraries(filterfalse(is_versioned, new))
+
 
 class CudaCompiler(Compiler):
 
@@ -748,6 +770,12 @@ def __init_finalize__(self, **kwargs):
         # garbage, since the CUDA kernel behaviour would be undefined
         check_cuda_runtime()
 
+    @property
+    def std(self):
+        """
+        The language standard to compile with.
+
+        For C++ this depends on the detected CUDA version: `c++17` from
+        CUDA 13 onwards, `c++14` otherwise, including when the CUDA version
+        cannot be determined.
+        """
+        if not self._cpp:
+            return self._cstd
+
+        # `get_cuda_version` returns None when `nvcc` cannot be queried or
+        # its output cannot be parsed, hence the explicit check before
+        # accessing `.major`
+        cuda_version = get_cuda_version()
+        # Since CUDA 13, code needs compiling with C++17 standard
+        if cuda_version is not None and cuda_version.major >= 13:
+            return 'c++17'
+        return 'c++14'
+
     def __lookup_cmds__(self):
         self.CC = 'nvcc'
         self.CXX = 'nvcc'
@@ -1065,8 +1093,6 @@ def __contains__(self, key):
     'aomp': AOMPCompiler,
     'amdclang': AOMPCompiler,
     'hip': HipCompiler,
-    'pgcc': PGICompiler,
-    'pgi': PGICompiler,
     'nvc': NvidiaCompiler,
     'nvc++': NvidiaCompiler,
     'nvidia': NvidiaCompiler,
 
@@ -177,7 +177,6 @@ def _specialize_clusters(cls, clusters, **kwargs):
         # Reduce flops
         clusters = cire(clusters, 'sops', sregistry, options, platform)
         clusters = factorize(clusters, **kwargs)
-        clusters = optimize_pows(clusters)
 
         # The previous passes may have created fusion opportunities
         clusters = fuse(clusters)
 
@@ -218,7 +218,6 @@ def _specialize_clusters(cls, clusters, **kwargs):
         # Reduce flops
         clusters = cire(clusters, 'sops', sregistry, options, platform)
         clusters = factorize(clusters, **kwargs)
-        clusters = optimize_pows(clusters)
 
         # The previous passes may have created fusion opportunities
         clusters = fuse(clusters)
 
@@ -335,20 +335,6 @@ def __call__(self, x0=None, fd_order=None, side=None, method=None, **kwargs):
             except AttributeError:
                 raise TypeError("fd_order incompatible with dimensions") from None
 
-        if isinstance(self.expr, Derivative):
-            # In case this was called on a perfect cross-derivative `u.dxdy`
-            # we need to propagate the call to the nested derivative
-            rkwe = dict(rkw)
-            rkwe.pop('weights', None)
-            if 'x0' in rkwe:
-                rkwe['x0'] = self._filter_dims(self.expr._filter_dims(rkw['x0']),
-                                               neg=True)
-            if fd_order is not None:
-                fdo = self.expr._filter_dims(_fd_order)
-                if fdo:
-                    rkwe['fd_order'] = fdo
-            rkw['expr'] = self.expr(**rkwe)
-
         if fd_order is not None:
             rkw['fd_order'] = self._filter_dims(_fd_order, as_tuple=True)
 
@@ -530,9 +516,12 @@ def _eval_at(self, func):
             # it into `u(x + h_x/2).dx` and `v(x).dx`, since they require
             # different FD indices
             mapper = as_mapper(self.expr._args_diff, lambda i: i.staggered)
+            if len(mapper) == 1:
+                # All terms have the same staggering, we can use expr as is
+                return self._rebuild(self.expr, **rkw)
             args = [self.expr.func(*v) for v in mapper.values()]
             args.extend([a for a in self.expr.args if a not in self.expr._args_diff])
-            args = [self._rebuild(a, **rkw) for a in args]
+            args = [self._rebuild(a)._eval_at(func) for a in args]
             return self.expr.func(*args)
         elif self.expr.is_Mul:
             # For Mul, We treat the basic case `u(x + h_x/2) * v(x) which is what appear