Skip to content

Commit 1e738f3

Browse files
authored
Merge branch 'main' into patch-1
2 parents b84d993 + f020eb1 commit 1e738f3

32 files changed

Lines changed: 504 additions & 306 deletions

FAQ.md

Lines changed: 22 additions & 22 deletions
Large diffs are not rendered by default.

benchmarks/user/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ selected and used for all remaining timesteps.
147147

148148
The "backend compiler" takes as input the code generated by Devito and
149149
translates it into a shared object. Supported backend compilers are `gcc`,
150-
`icc`, `pgcc`, `clang`. For each of these compilers, Devito uses some preset compilation
150+
`icc`, `clang`. For each of these compilers, Devito uses some preset compilation
151151
flags (e.g., -O3, -march=native, etc).
152152

153153
The default backend compiler is `gcc`. To change it, one should set the

devito/arch/archinfo.py

Lines changed: 100 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""Collection of utilities to detect properties of the underlying architecture."""
22

3+
from contextlib import suppress
34
from functools import cached_property
4-
from subprocess import PIPE, Popen, DEVNULL, run
5+
from subprocess import PIPE, Popen, DEVNULL, run, CalledProcessError
56
from pathlib import Path
67
import ctypes
78
import re
@@ -11,35 +12,38 @@
1112

1213
import cpuinfo
1314
import numpy as np
15+
from packaging.version import parse, InvalidVersion
1416
import psutil
1517

1618
from devito.logger import warning
1719
from devito.tools import as_tuple, all_equal, memoized_func
1820

19-
__all__ = ['platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_visible_devices',
20-
'get_nvidia_cc', 'get_cuda_path', 'get_hip_path', 'check_cuda_runtime',
21-
'get_m1_llvm_path', 'get_advisor_path', 'Platform', 'Cpu64', 'Intel64',
22-
'IntelSkylake', 'Amd', 'Arm', 'Power', 'Device', 'NvidiaDevice',
23-
'AmdDevice', 'IntelDevice',
24-
# Brand-agnostic
25-
'ANYCPU', 'ANYGPU',
26-
# Intel CPUs
27-
'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
28-
'SKX', 'KLX', 'CLX', 'CLK', 'SPR',
29-
# AMD CPUs
30-
'AMD',
31-
# ARM CPUs
32-
'ARM', 'AppleArm', 'M1', 'M2', 'M3',
33-
'Graviton', 'GRAVITON2', 'GRAVITON3', 'GRAVITON4',
34-
'Cortex', 'NvidiaArm', 'GRACE',
35-
# Other legacy CPUs
36-
'POWER8', 'POWER9',
37-
# Generic GPUs
38-
'AMDGPUX', 'NVIDIAX', 'INTELGPUX',
39-
# Nvidia GPUs
40-
'VOLTA', 'AMPERE', 'HOPPER', 'BLACKWELL',
41-
# Intel GPUs
42-
'PVC', 'INTELGPUMAX', 'MAX1100', 'MAX1550']
21+
__all__ = [
22+
'platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_visible_devices',
23+
'get_nvidia_cc', 'get_cuda_path', 'get_cuda_version', 'get_hip_path',
24+
'check_cuda_runtime', 'get_m1_llvm_path', 'get_advisor_path', 'Platform',
25+
'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power', 'Device',
26+
'NvidiaDevice', 'AmdDevice', 'IntelDevice',
27+
# Brand-agnostic
28+
'ANYCPU', 'ANYGPU',
29+
# Intel CPUs
30+
'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
31+
'SKX', 'KLX', 'CLX', 'CLK', 'SPR',
32+
# AMD CPUs
33+
'AMD',
34+
# ARM CPUs
35+
'ARM', 'AppleArm', 'M1', 'M2', 'M3',
36+
'Graviton', 'GRAVITON2', 'GRAVITON3', 'GRAVITON4',
37+
'Cortex', 'NvidiaArm', 'GRACE',
38+
# Other legacy CPUs
39+
'POWER8', 'POWER9',
40+
# Generic GPUs
41+
'AMDGPUX', 'NVIDIAX', 'INTELGPUX',
42+
# Nvidia GPUs
43+
'VOLTA', 'AMPERE', 'HOPPER', 'BLACKWELL',
44+
# Intel GPUs
45+
'PVC', 'INTELGPUMAX', 'MAX1100', 'MAX1550'
46+
]
4347

4448

4549
@memoized_func
@@ -553,6 +557,30 @@ def get_cuda_path():
553557
return None
554558

555559

560+
@memoized_func
def get_cuda_version():
    """
    Detect the version of the CUDA toolkit by parsing `nvcc --version`.

    The `nvcc` executable is taken from the directory returned by
    `get_cuda_path()` if available, otherwise it is looked up in the `PATH`.

    Returns
    -------
    packaging.version.Version or None
        The detected version, or None if `nvcc` cannot be executed, exits
        with a non-zero status, or prints something that cannot be parsed.
    """
    cuda_home = get_cuda_path()
    if cuda_home is None:
        nvcc_version_command = ['nvcc', '--version']
    else:
        nvcc_version_command = [f'{cuda_home}/bin/nvcc', '--version']

    try:
        # `check=True` turns a non-zero exit status into a CalledProcessError
        out = run(nvcc_version_command, capture_output=True, text=True,
                  check=True)
    except (OSError, CalledProcessError):
        # `nvcc` is missing, not executable, or failed
        return None

    # The relevant line is expected to look like
    # `Cuda compilation tools, release 12.4, V12.4.131`
    _, found, tail = out.stdout.partition('release')
    if not found:
        return None

    # The full version string is what follows the first comma after
    # `release`, up to the end of that line
    version_str = tail.partition(',')[2].partition('\n')[0]
    with suppress(InvalidVersion):
        return parse(version_str)

    return None
582+
583+
556584
@memoized_func
557585
def get_advisor_path():
558586
"""
@@ -619,28 +647,35 @@ def get_m1_llvm_path(language):
619647

620648
@memoized_func
621649
def check_cuda_runtime():
622-
libnames = ('libcudart.so', 'libcudart.dylib', 'cudart.dll')
623-
for libname in libnames:
624-
try:
625-
cuda = ctypes.CDLL(libname)
626-
except OSError:
627-
continue
628-
else:
629-
break
630-
else:
650+
libname = ctypes.util.find_library("cudart")
651+
if not libname:
631652
warning("Unable to check compatibility of NVidia driver and runtime")
632653
return
633654

655+
cuda = ctypes.CDLL(libname)
634656
driver_version = ctypes.c_int()
635657
runtime_version = ctypes.c_int()
636658

637659
if cuda.cudaDriverGetVersion(ctypes.byref(driver_version)) == 0 and \
638660
cuda.cudaRuntimeGetVersion(ctypes.byref(runtime_version)) == 0:
639661
driver_version = driver_version.value
640662
runtime_version = runtime_version.value
641-
if driver_version < runtime_version:
642-
warning("The NVidia driver (v%d) on this system may not be compatible "
643-
"with the CUDA runtime (v%d)" % (driver_version, runtime_version))
663+
664+
driver_v = parse(str(driver_version/1000))
665+
runtime_v = parse(str(runtime_version/1000))
666+
# First check the "major" version, known to be incompatible
667+
if driver_v.major < runtime_v.major:
668+
raise RuntimeError(
669+
f'The NVidia driver (v{driver_version}) on this system is '
670+
f'not compatible with the CUDA runtime (v{runtime_version})'
671+
)
672+
# Next check the version including minor revisions which may still
673+
# be compatible
674+
elif driver_v < runtime_v:
675+
warning(
676+
f'The NVidia driver (v{driver_version}) on this system may '
677+
f'not be compatible with the CUDA runtime (v{runtime_version})'
678+
)
644679
else:
645680
warning("Unable to check compatibility of NVidia driver and runtime")
646681

@@ -1069,6 +1104,32 @@ def march(self):
10691104
return 'tesla'
10701105
return None
10711106

1107+
@cached_property
def max_shm_per_block(self):
    """
    Get the maximum amount of shared memory per thread block, in bytes.

    The value is queried from the CUDA runtime for the current device. If
    the runtime library cannot be found or loaded, or if any of the runtime
    calls fails, a default of 64 KB is returned instead.
    """
    # `ctypes.util` is a submodule which `import ctypes` alone does not load
    from ctypes.util import find_library

    default = 64 * 1024  # 64 KB

    # Load libcudart
    libname = find_library("cudart")
    if not libname:
        return default
    try:
        lib = ctypes.CDLL(libname)
    except OSError:
        return default

    cudaDevAttrMaxSharedMemoryPerBlockOptin = 97

    # Get current device; the runtime calls return 0 on success
    dev = ctypes.c_int()
    if lib.cudaGetDevice(ctypes.byref(dev)) != 0:
        return default

    # Query attribute
    value = ctypes.c_int()
    status = lib.cudaDeviceGetAttribute(
        ctypes.byref(value),
        ctypes.c_int(cudaDevAttrMaxSharedMemoryPerBlockOptin),
        dev
    )
    if status != 0 or value.value <= 0:
        # A failed query leaves `value` meaningless; do not report it
        return default

    return value.value
1132+
10721133
def supports(self, query, language=None):
10731134
if language != 'cuda':
10741135
return False
@@ -1125,6 +1186,8 @@ class AmdDevice(Device):
11251186

11261187
max_mem_trans_nbytes = 256
11271188

1189+
max_shm_per_block = 64*1024 # 64 KB
1190+
11281191
@cached_property
11291192
def march(cls):
11301193
# TODO: this corresponds to Vega, which acts as the fallback `march`

devito/arch/compiler.py

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from functools import partial
22
from hashlib import sha1
3+
from itertools import filterfalse
34
from os import environ, path, makedirs
45
from packaging.version import Version
56
from subprocess import (DEVNULL, PIPE, CalledProcessError, check_output,
@@ -13,9 +14,11 @@
1314
from codepy.toolchain import (GCCToolchain,
1415
call_capture_output as _call_capture_output)
1516

16-
from devito.arch import (AMDGPUX, Cpu64, AppleArm, NvidiaDevice, POWER8, POWER9,
17-
Graviton, Cortex, IntelDevice, get_nvidia_cc, NvidiaArm,
18-
check_cuda_runtime, get_m1_llvm_path)
17+
from devito.arch import (
18+
AMDGPUX, Cpu64, AppleArm, NvidiaDevice, POWER8, POWER9, Graviton,
19+
Cortex, IntelDevice, get_nvidia_cc, NvidiaArm, check_cuda_runtime,
20+
get_cuda_version, get_m1_llvm_path
21+
)
1922
from devito.exceptions import CompilationError
2023
from devito.logger import debug, warning
2124
from devito.parameters import configuration
@@ -63,7 +66,10 @@ def sniff_compiler_version(cc, allow_fail=False):
6366
elif ver.startswith("icx"):
6467
compiler = "icx"
6568
elif ver.startswith("pgcc"):
66-
compiler = "pgcc"
69+
raise CompilationError(
70+
'Portland compiler no longer supported,'
71+
' use `nvc` from the nvidia HPC SDK instead'
72+
)
6773
elif ver.startswith("nvc++"):
6874
compiler = "nvc"
6975
elif ver.startswith("cray"):
@@ -626,7 +632,7 @@ def __lookup_cmds__(self):
626632
self.MPICXX = 'mpicxx'
627633

628634

629-
class PGICompiler(Compiler):
635+
class NvidiaCompiler(Compiler):
630636

631637
_default_cpp = True
632638

@@ -656,25 +662,41 @@ def __init_finalize__(self, **kwargs):
656662

657663
if not configuration['safe-math']:
658664
self.cflags.append('-fast')
659-
# Default PGI compile for a target is GPU and single threaded host.
665+
# Default compile for a target is GPU and single threaded host.
660666
# self.cflags += ['-ta=tesla,host']
661667

662668
def __lookup_cmds__(self):
663-
# NOTE: using `pgc++` instead of `pgcc` because of issue #1219
664-
self.CC = 'pgc++'
665-
self.CXX = 'pgc++'
666-
self.MPICC = 'mpic++'
667-
self.MPICXX = 'mpicxx'
668-
669-
670-
class NvidiaCompiler(PGICompiler):
671-
672-
def __lookup_cmds__(self):
669+
# Note: Using `nvc++` instead of `nvc` because of issue #1219
673670
self.CC = 'nvc++'
674671
self.CXX = 'nvc++'
675672
self.MPICC = 'mpic++'
676673
self.MPICXX = 'mpicxx'
677674

675+
def add_libraries(self, libs):
    """
    Add libraries to link against, with support for versioned shared
    objects given as `:libfoo.so.2.0`.

    NvidiaCompiler inherits from Compiler, which inherits from
    `codepy.GCCToolchain`. GCC supports linking versioned shared objects
    with the syntax `gcc -L/path/to/versioned/lib -l:libfoo.so.2.0 ...`,
    but the Nvidia compiler does not, nor does `codepy.GCCToolchain` know
    about versioned objects.

    Since this is just linking information, the versioned entries are
    handed straight to the linker through `-Wl,...`, together with all of
    the library directories provided thus far, as the linker does support
    `ld -L/path/to/versioned/lib -l:libfoo.so.2.0 ...`. All other entries
    are forwarded to the parent implementation.

    Note: It would be nicer to just look in the one _relevant_ lib dir!
    """
    def is_versioned(lib):
        return lib.startswith(':')

    new = as_list(libs)
    sonames = [lib.removeprefix(':') for lib in new if is_versioned(lib)]

    # One `-L<dir>` per known library directory. Building the pieces
    # separately avoids emitting a dangling `-L` when there are none
    search_dirs = [f'-L{libdir}' for libdir in self.library_dirs]
    self.add_ldflags([
        ','.join(['-Wl', *search_dirs, f'-l:{soname}'])
        for soname in sonames
    ])

    super().add_libraries(filterfalse(is_versioned, new))
699+
678700

679701
class CudaCompiler(Compiler):
680702

@@ -748,6 +770,12 @@ def __init_finalize__(self, **kwargs):
748770
# garbage, since the CUDA kernel behaviour would be undefined
749771
check_cuda_runtime()
750772

773+
@property
def std(self):
    """
    The language standard to compile with.

    When compiling C++, this is `c++17` if the detected CUDA version is 13
    or newer and `c++14` otherwise, including when the CUDA version cannot
    be detected. When not compiling C++, it is `self._cstd`.
    """
    if not self._cpp:
        # The CUDA version is irrelevant here, so don't query it
        return self._cstd

    # Since CUDA 13, code needs compiling with C++17 standard
    cuda_version = get_cuda_version()
    if cuda_version is not None and cuda_version.major >= 13:
        return 'c++17'
    return 'c++14'
778+
751779
def __lookup_cmds__(self):
752780
self.CC = 'nvcc'
753781
self.CXX = 'nvcc'
@@ -1065,8 +1093,6 @@ def __contains__(self, key):
10651093
'aomp': AOMPCompiler,
10661094
'amdclang': AOMPCompiler,
10671095
'hip': HipCompiler,
1068-
'pgcc': PGICompiler,
1069-
'pgi': PGICompiler,
10701096
'nvc': NvidiaCompiler,
10711097
'nvc++': NvidiaCompiler,
10721098
'nvidia': NvidiaCompiler,

devito/core/cpu.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,6 @@ def _specialize_clusters(cls, clusters, **kwargs):
177177
# Reduce flops
178178
clusters = cire(clusters, 'sops', sregistry, options, platform)
179179
clusters = factorize(clusters, **kwargs)
180-
clusters = optimize_pows(clusters)
181180

182181
# The previous passes may have created fusion opportunities
183182
clusters = fuse(clusters)

devito/core/gpu.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,6 @@ def _specialize_clusters(cls, clusters, **kwargs):
218218
# Reduce flops
219219
clusters = cire(clusters, 'sops', sregistry, options, platform)
220220
clusters = factorize(clusters, **kwargs)
221-
clusters = optimize_pows(clusters)
222221

223222
# The previous passes may have created fusion opportunities
224223
clusters = fuse(clusters)

devito/finite_differences/derivative.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -335,20 +335,6 @@ def __call__(self, x0=None, fd_order=None, side=None, method=None, **kwargs):
335335
except AttributeError:
336336
raise TypeError("fd_order incompatible with dimensions") from None
337337

338-
if isinstance(self.expr, Derivative):
339-
# In case this was called on a perfect cross-derivative `u.dxdy`
340-
# we need to propagate the call to the nested derivative
341-
rkwe = dict(rkw)
342-
rkwe.pop('weights', None)
343-
if 'x0' in rkwe:
344-
rkwe['x0'] = self._filter_dims(self.expr._filter_dims(rkw['x0']),
345-
neg=True)
346-
if fd_order is not None:
347-
fdo = self.expr._filter_dims(_fd_order)
348-
if fdo:
349-
rkwe['fd_order'] = fdo
350-
rkw['expr'] = self.expr(**rkwe)
351-
352338
if fd_order is not None:
353339
rkw['fd_order'] = self._filter_dims(_fd_order, as_tuple=True)
354340

@@ -530,9 +516,12 @@ def _eval_at(self, func):
530516
# it into `u(x + h_x/2).dx` and `v(x).dx`, since they require
531517
# different FD indices
532518
mapper = as_mapper(self.expr._args_diff, lambda i: i.staggered)
519+
if len(mapper) == 1:
520+
# All terms have the same staggering, we can use expr as is
521+
return self._rebuild(self.expr, **rkw)
533522
args = [self.expr.func(*v) for v in mapper.values()]
534523
args.extend([a for a in self.expr.args if a not in self.expr._args_diff])
535-
args = [self._rebuild(a, **rkw) for a in args]
524+
args = [self._rebuild(a)._eval_at(func) for a in args]
536525
return self.expr.func(*args)
537526
elif self.expr.is_Mul:
538527
# For Mul, We treat the basic case `u(x + h_x/2) * v(x) which is what appear

0 commit comments

Comments
 (0)