diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index 60646066b7..9df5e1309f 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -102,10 +102,16 @@ def ccode(self): @property def view(self): - """A representation of the IET rooted in ``self``.""" + """A high-level representation of the IET rooted in `self`.""" from devito.ir.iet.visitors import printAST return printAST(self) + @property + def view_cir(self): + from devito.ir.iet.visitors import CGen + from devito.passes.iet.languages.CIR import CIRPrinter + return str(CGen(printer=CIRPrinter).visit(self)) + @property def children(self): """Return the traversable children.""" @@ -148,7 +154,7 @@ def writes(self): return () def _signature_items(self): - return (str(self),) + return (self.view_cir,) class ExprStmt: diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index a3639052c2..de3ad34234 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -8,7 +8,8 @@ from devito.ir.support.utils import AccessMode, extrema from devito.ir.support.vector import LabeledVector, Vector from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals, - q_constant, q_affine, q_routine, search, uxreplace) + q_constant, q_comp_acc, q_affine, q_routine, search, + uxreplace) from devito.tools import (Tag, as_mapper, as_tuple, is_integer, filter_sorted, flatten, memoized_meth, memoized_generator) from devito.types import (ComponentAccess, Dimension, DimensionTuple, Fence, @@ -529,9 +530,16 @@ def __hash__(self): (self.source, self.sink, self.source.timestamp == self.sink.timestamp) ) - @property + @cached_property def function(self): - return self.source.function + if q_comp_acc(self.source.access) and not q_comp_acc(self.sink.access): + # E.g., `source=ab[x].x` and `sink=ab[x]` -> `a(x)` + return self.source.access.function_access + elif q_comp_acc(self.sink.access) and not q_comp_acc(self.source.access): + # E.g., `source=ab[x]` and `sink=ab[x].y` -> `b(x)` + return self.sink.access.function_access + else: + return self.source.function @property def findices(self): @@ -955,7 +963,7 @@ def reads_gen(self): @memoized_generator def reads_smart_gen(self, f): """ - Generate all read access to a given function. + Generate all read accesses to a given function. StencilDimensions, if any, are replaced with their extrema. diff --git a/devito/mpi/halo_scheme.py b/devito/mpi/halo_scheme.py index 998ae28881..198e5d096e 100644 --- a/devito/mpi/halo_scheme.py +++ b/devito/mpi/halo_scheme.py @@ -30,12 +30,14 @@ class HaloLabel(Tag): class HaloSchemeEntry(EnrichedTuple): __rargs__ = ('loc_indices', 'loc_dirs', 'halos', 'dims') + __rkwargs__ = ('bundle',) - def __new__(cls, loc_indices, loc_dirs, halos, dims, getters=None): + def __new__(cls, loc_indices, loc_dirs, halos, dims, bundle=None, getters=None): + getters = cls.__rargs__ + cls.__rkwargs__ items = [frozendict(loc_indices), frozendict(loc_dirs), - frozenset(halos), frozenset(dims)] - kwargs = dict(zip(cls.__rargs__, items)) - return super().__new__(cls, *items, getters=cls.__rargs__, **kwargs) + frozenset(halos), frozenset(dims), bundle] + kwargs = dict(zip(getters, items)) + return super().__new__(cls, *items, getters=getters, **kwargs) def __hash__(self): return hash((self.loc_indices, self.loc_dirs, self.halos, self.dims)) @@ -47,7 +49,8 @@ def union(self, other): exception is raised. """ if self.loc_indices != other.loc_indices or \ - self.loc_dirs != other.loc_dirs: + self.loc_dirs != other.loc_dirs or \ + self.bundle is not other.bundle: raise HaloSchemeException( "Inconsistency found while building a HaloScheme" ) @@ -56,7 +59,7 @@ def union(self, other): dims = self.dims | other.dims return HaloSchemeEntry(self.loc_indices, self.loc_dirs, halos, dims, - getters=self.getters) + bundle=self.bundle, getters=self.getters) Halo = namedtuple('Halo', 'dim side') @@ -168,7 +171,7 @@ def union(self, halo_schemes): elif not v.loc_indices or hse.loc_indices == v.loc_indices: loc_indices, loc_dirs = hse.loc_indices, hse.loc_dirs else: - # The `loc_dirs` must match otherwise it'd be a symptom there's + # These must match otherwise it'd be a symptom there's # something horribly broken elsewhere! assert hse.loc_dirs == v.loc_dirs assert list(hse.loc_indices) == list(v.loc_indices) @@ -185,7 +188,11 @@ def union(self, halo_schemes): halos = hse.halos | v.halos dims = hse.dims | v.dims - fmapper[k] = HaloSchemeEntry(loc_indices, loc_dirs, halos, dims) + assert hse.bundle is v.bundle + + fmapper[k] = HaloSchemeEntry( + loc_indices, loc_dirs, halos, dims, bundle=hse.bundle + ) # Compute the `honored` union for d, v in i.honored.items(): @@ -641,8 +648,12 @@ def _uxreplace_dispatch_haloscheme(hs0, rule): for i, v in rule.items(): if i is f: # Yes! - g = v - hse = hse0 + if v.is_Bundle: + g = f + hse = hse0._rebuild(bundle=v) + else: + g = v + hse = hse0 elif i.is_Indexed and i.function is f and v.is_Indexed: # Yes, but through an Indexed, hence the `loc_indices` may now diff --git a/devito/mpi/routines.py b/devito/mpi/routines.py index a3562a7eb1..df13e2afe8 100644 --- a/devito/mpi/routines.py +++ b/devito/mpi/routines.py @@ -17,9 +17,9 @@ from devito.symbolics import (Byref, CondNe, FieldFromPointer, FieldFromComposite, IndexedPointer, Macro, cast, subs_op_args) from devito.tools import (as_mapper, dtype_to_mpitype, dtype_len, infer_datasize, - flatten, generator, is_integer, split) -from devito.types import (Array, Bag, Dimension, Eq, Symbol, LocalObject, - CompositeObject, CustomDimension) + flatten, generator, is_integer) +from devito.types import (Array, Bag, BundleView, Dimension, Eq, Symbol, + LocalObject, CompositeObject, CustomDimension) __all__ = ['HaloExchangeBuilder', 'ReductionBuilder', 'mpi_registry'] @@ -292,19 +292,28 @@ def _make_bundles(self, hs): mapper = as_mapper(halo_scheme.fmapper, lambda i: halo_scheme.fmapper[i]) for hse, components in mapper.items(): - # We recast everything as Bags for simplicity -- worst case scenario - # all Bags only have one component. Existing Bundles are preserved halo_scheme = halo_scheme.drop(components) - bundles, candidates = split(tuple(components), lambda i: i.is_Bundle) - for b in bundles: - halo_scheme = halo_scheme.add(b, hse) + # Existing Bundles are preserved + if hse.bundle: + if set(components) == set(hse.bundle.components): + halo_scheme = halo_scheme.add(hse.bundle, hse) + else: + name = f'bundleview_{hse.bundle.name}' + bundle_view = BundleView( + name=name, components=components, parent=hse.bundle + ) + halo_scheme = halo_scheme.add(bundle_view, hse) + continue + + # We recast everything else as Bags for simplicity -- worst case + # scenario all Bags only have one component. try: - name = "bag_%s" % "".join(f.name for f in candidates) - bag = Bag(name=name, components=candidates) + name = "bag_%s" % "".join(f.name for f in components) + bag = Bag(name=name, components=components) halo_scheme = halo_scheme.add(bag, hse) except ValueError: - for i in candidates: + for i in components: name = "bag_%s" % i.name bag = Bag(name=name, components=i) halo_scheme = halo_scheme.add(bag, hse) @@ -363,10 +372,17 @@ def _make_copy(self, f, hse, key, swap=False): else: swap = lambda i, j: (j, i) name = 'scatter%s' % key + if isinstance(f, Bag): for i, c in enumerate(f.components): eqns.append(Eq(*swap(buf[[i] + bdims], c[findices]))) + elif isinstance(f, BundleView): + assert f.parent is hse.bundle + for i, c in enumerate(f.components): + indices = [f.parent.components.index(c), *findices] + eqns.append(Eq(*swap(buf[[i] + bdims], f.parent[indices]))) else: + assert f.is_Bundle for i in range(f.ncomp): eqns.append(Eq(*swap(buf[[i] + bdims], f[[i] + findices]))) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 0a9f9db62f..47106dd49a 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -13,11 +13,12 @@ from devito.data import default_allocator from devito.exceptions import (CompilationError, ExecutionError, InvalidArgument, InvalidOperator) -from devito.logger import debug, info, perf, warning, is_log_enabled_for, switch_log_level +from devito.logger import (debug, info, perf, warning, is_log_enabled_for, + switch_log_level) from devito.ir.equations import LoweredEq, lower_exprs, concretize_subdims from devito.ir.clusters import ClusterGroup, clusterize -from devito.ir.iet import (Callable, CInterface, EntryFunction, FindSymbols, MetaCall, - derive_parameters, iet_build) +from devito.ir.iet import (Callable, CInterface, EntryFunction, FindSymbols, + MetaCall, derive_parameters, iet_build) from devito.ir.support import AccessMode, SymbolRegistry from devito.ir.stree import stree_build from devito.operator.profiling import create_profile @@ -26,8 +27,7 @@ from devito.parameters import configuration from devito.passes import (Graph, lower_index_derivatives, generate_implicit, generate_macros, minimize_symbols, unevaluate, - error_mapper, is_on_device) -from devito.passes.iet.dtypes import lower_dtypes + error_mapper, is_on_device, lower_dtypes) from devito.symbolics import estimate_cost, subs_op_args from devito.tools import (DAG, OrderedSet, Signer, ReducerMap, as_mapper, as_tuple, flatten, filter_sorted, frozendict, is_integer, @@ -488,7 +488,7 @@ def _lower_iet(cls, uiet, profiler=None, **kwargs): # Extract the necessary macros from the symbolic objects generate_macros(graph, **kwargs) - # Add type specific metadata + # Target-specific lowering lower_dtypes(graph, **kwargs) # Target-independent optimizations diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index 88a07816f3..b9136dcffe 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -123,9 +123,7 @@ class InjectBuffers(Queue): def __init__(self, mapper, sregistry, options): super().__init__() - # Sort the mapper so that we always process the same Function in the - # same order, hence we get deterministic code generation - self.mapper = {i: mapper[i] for i in sorted(mapper, key=lambda i: i.name)} + self.mapper = mapper self.sregistry = sregistry self.options = options @@ -302,6 +300,9 @@ def generate_buffers(clusters, key, sregistry, options, **kwargs): # {candidate buffered Function -> [Clusters that access it]} bfmap = map_buffered_functions(clusters, key) + # Sort for deterministic code generation + bfmap = {i: bfmap[i] for i in sorted(bfmap, key=lambda i: i.name)} + # {buffered Function -> Buffer} xds = {} mapper = {} @@ -718,7 +719,7 @@ def offset_from_centre(d, indices): # `time/factor` -- the starting pointing at time_m or time_M v = indices[0] try: - p = sum(v.args[1:]) + p = v.func(*[i for i in v.args if not is_integer(i)]) if not ((p - v).is_Integer or (p - v).is_Symbol): raise ValueError except (IndexError, ValueError): diff --git a/devito/passes/iet/definitions.py b/devito/passes/iet/definitions.py index 57f7fca54f..ee440f9617 100644 --- a/devito/passes/iet/definitions.py +++ b/devito/passes/iet/definitions.py @@ -9,17 +9,22 @@ import numpy as np -from devito.ir import (Block, Call, Definition, DummyExpr, Return, EntryFunction, - FindNodes, FindSymbols, MapExprStmts, Transformer, - make_callable) +from devito.ir import ( + Block, Call, Definition, DummyExpr, Iteration, List, Return, EntryFunction, + FindNodes, FindSymbols, MapExprStmts, Transformer, make_callable +) from devito.passes import is_gpu_create from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import LangBB -from devito.symbolics import (Byref, DefFunction, FieldFromPointer, IndexedPointer, - SizeOf, VOID, pow_to_mul, unevaluate) +from devito.symbolics import ( + Byref, DefFunction, FieldFromPointer, IndexedPointer, ListInitializer, + SizeOf, VOID, pow_to_mul, unevaluate +) from devito.tools import as_mapper, as_list, as_tuple, filter_sorted, flatten -from devito.types import (Array, ComponentAccess, CustomDimension, DeviceMap, - DeviceRM, Eq, Symbol) +from devito.types import ( + Array, ComponentAccess, CustomDimension, Dimension, DeviceMap, DeviceRM, + Eq, Symbol, size_t +) __all__ = ['DataManager', 'DeviceAwareDataManager', 'Storage'] @@ -40,6 +45,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.defined = set() + self.includes = set() def update(self, key, site, **kwargs): if key in self.defined: @@ -67,6 +73,10 @@ def map(self, key, site, k, v): self.defined.add((site[-1], key)) + def include(self, v): + if v: + self.includes.add(v) + class DataManager: @@ -132,8 +142,7 @@ def _alloc_array_on_global_mem(self, site, obj, storage): alloc = Call(name, efunc.parameters) storage.update(obj, site, allocs=alloc, efuncs=efunc) - - return self.langbb['header-memcpy'] + storage.include(self.langbb['header-memcpy']) def _alloc_scalar_on_low_lat_mem(self, site, expr, storage): """ @@ -168,53 +177,82 @@ def _alloc_mapped_array_on_high_bw_mem(self, site, obj, storage, *args): """ decl = Definition(obj) + # NOTE: the `arity` is calculated such as `sizeof(float3)/sizeof(float)` + # for portability reasons (since we don't know the size of compound + # types a priori) + arity_param = Symbol(name='arity', dtype=size_t) + arity_arg = (SizeOf(obj.indexed._C_typedata) / + SizeOf(obj.c0.indexed._C_typedata)) + ndims_param = Symbol(name='ndims', dtype=size_t) + ndims_arg = obj.ndim + shape_param = Array(name=f'{obj.name}_shape', dtype=np.uint64, + dimensions=(Dimension(name='d'),), scope='rvalue') + shape_arg = ListInitializer(obj.c0.symbolic_shape, dtype=shape_param.dtype) + + ffp0 = FieldFromPointer(obj._C_field_data, obj._C_symbol) + ffp1 = FieldFromPointer(obj._C_field_shape, obj._C_symbol) + ffp2 = FieldFromPointer(obj._C_field_size, obj._C_symbol) + ffp3 = FieldFromPointer(obj._C_field_nbytes, obj._C_symbol) + ffp4 = FieldFromPointer(obj._C_field_arity, obj._C_symbol) + # Allocate the Array struct memptr = VOID(Byref(obj._C_symbol), '**') alignment = obj._data_alignment nbytes = SizeOf(obj._C_typedata) - allocs = [self.langbb['host-alloc'](memptr, alignment, nbytes)] + alloc0 = self.langbb['host-alloc'](memptr, alignment, nbytes) - nbytes_param = Symbol(name='nbytes', dtype=np.uint64, is_const=True) - nbytes_arg = SizeOf(obj.indexed._C_typedata)*obj.size + # Allocate the shape array + memptr = VOID(Byref(ffp1), '**') + nbytes = SizeOf(obj._C_size_type)*ndims_param + alloc1 = self.langbb['host-alloc'](memptr, alignment, nbytes) + + # Initialize the Array metadata + dim, = shape_param.dimensions + init = [DummyExpr(ffp2, 1)] + init.append(Iteration( + List(body=( + DummyExpr(IndexedPointer(ffp1, dim), shape_param[dim]), + DummyExpr(ffp2, ffp2*shape_param[dim]) + )), + dim, + ndims_param - 1, + )) + init.append(DummyExpr(ffp3, ffp2*arity_param)) + init.append(DummyExpr(ffp4, arity_param)) # Allocate the underlying host data - ffp0 = FieldFromPointer(obj._C_field_data, obj._C_symbol) memptr = VOID(Byref(ffp0), '**') - allocs.append(self.langbb['host-alloc-pin'](memptr, alignment, nbytes_param)) - - # Initialize the Array struct - ffp1 = FieldFromPointer(obj._C_field_nbytes, obj._C_symbol) - init0 = DummyExpr(ffp1, nbytes_param) - ffp2 = FieldFromPointer(obj._C_field_size, obj._C_symbol) - init1 = DummyExpr(ffp2, 0) + alloc2 = self.langbb['host-alloc-pin'](memptr, alignment, ffp3) + # Free all of the allocated data frees = [self.langbb['host-free-pin'](ffp0), + self.langbb['host-free'](ffp1), self.langbb['host-free'](obj._C_symbol)] # Allocate the underlying device data, if required by the backend - alloc, free = self._make_dmap_allocfree(obj, nbytes_param) - - # Chain together all allocs and frees - allocs = as_tuple(allocs) + as_tuple(alloc) - frees = as_tuple(free) + as_tuple(frees) + alloc_dmap, free_dmap = self._make_dmap_allocfree(obj, ffp3) ret = Return(obj._C_symbol) # Wrap everything in a Callable so that we can reuse the same code # for equivalent Array structs name = self.sregistry.make_name(prefix='alloc') - body = (decl, *allocs, init0, init1, ret) + body = (decl, alloc0, alloc1, *init, alloc2, *as_tuple(alloc_dmap), ret) efunc0 = make_callable(name, body, retval=obj) args = list(efunc0.parameters) - args[args.index(nbytes_param)] = nbytes_arg + args[args.index(arity_param)] = arity_arg + args[args.index(ndims_param)] = ndims_arg + args[args.index(shape_param)] = shape_arg alloc = Call(name, args, retobj=obj) # Same story for the frees name = self.sregistry.make_name(prefix='free') + frees = as_tuple(free_dmap) + as_tuple(frees) efunc1 = make_callable(name, frees) free = Call(name, efunc1.parameters) storage.update(obj, site, allocs=alloc, frees=free, efuncs=(efunc0, efunc1)) + storage.include(self.langbb['header-array']) def _alloc_bundle_struct_on_high_bw_mem(self, site, obj, storage): """ @@ -222,35 +260,72 @@ def _alloc_bundle_struct_on_high_bw_mem(self, site, obj, storage): """ decl = Definition(obj) + # NOTE: the `arity` is calculated such as `sizeof(float3)/sizeof(float)` + # for portability reasons (since we don't know the size of compound + # types a priori) + arity_param = Symbol(name='arity', dtype=size_t) + arity_arg = (SizeOf(obj.indexed._C_typedata) / + SizeOf(obj.c0.indexed._C_typedata)) + ndims_param = Symbol(name='ndims', dtype=size_t) + ndims_arg = obj.ndim + shape_param = Array(name=f'{obj.name}_shape', dtype=np.uint64, + dimensions=(Dimension(name='d'),), scope='rvalue') + shape_arg = ListInitializer(obj.c0.symbolic_shape, dtype=shape_param.dtype) + + ffp1 = FieldFromPointer(obj._C_field_shape, obj._C_symbol) + ffp2 = FieldFromPointer(obj._C_field_size, obj._C_symbol) + ffp3 = FieldFromPointer(obj._C_field_nbytes, obj._C_symbol) + ffp4 = FieldFromPointer(obj._C_field_arity, obj._C_symbol) + # Allocate the Bundle struct memptr = VOID(Byref(obj._C_symbol), '**') alignment = obj._data_alignment nbytes = SizeOf(obj._C_typedata) - alloc = self.langbb['host-alloc'](memptr, alignment, nbytes) - - nbytes_param = Symbol(name='nbytes', dtype=np.uint64, is_const=True) - nbytes_arg = SizeOf(obj.indexed._C_typedata)*obj.size + alloc0 = self.langbb['host-alloc'](memptr, alignment, nbytes) - # Initialize the Bundle struct - ffp1 = FieldFromPointer(obj._C_field_nbytes, obj._C_symbol) - init0 = DummyExpr(ffp1, nbytes_param) - ffp2 = FieldFromPointer(obj._C_field_size, obj._C_symbol) - init1 = DummyExpr(ffp2, 0) + # Allocate the shape array + memptr = VOID(Byref(ffp1), '**') + nbytes = SizeOf(obj._C_size_type)*obj.ndim + alloc1 = self.langbb['host-alloc'](memptr, alignment, nbytes) - free = self.langbb['host-free'](obj._C_symbol) + # Initialize the Bundle metadata + dim, = shape_param.dimensions + init = [DummyExpr(ffp2, 1)] + init.append(Iteration( + List(body=( + DummyExpr(IndexedPointer(ffp1, dim), shape_param[dim]), + DummyExpr(ffp2, ffp2*shape_param[dim]) + )), + dim, + ndims_param - 1, + )) + init.append(DummyExpr(ffp3, ffp2*arity_param)) + init.append(DummyExpr(ffp4, arity_param)) + + # Free all of the allocated data + frees = [self.langbb['host-free'](ffp1), + self.langbb['host-free'](obj._C_symbol)] ret = Return(obj._C_symbol) # Wrap everything in a Callable so that we can reuse the same code # for equivalent Bundle structs name = self.sregistry.make_name(prefix='alloc') - body = (decl, alloc, init0, init1, ret) + body = (decl, alloc0, alloc1, *init, ret) efunc0 = make_callable(name, body, retval=obj) args = list(efunc0.parameters) - args[args.index(nbytes_param)] = nbytes_arg + args[args.index(arity_param)] = arity_arg + args[args.index(ndims_param)] = ndims_arg + args[args.index(shape_param)] = shape_arg alloc = Call(name, args, retobj=obj) - storage.update(obj, site, allocs=alloc, frees=free, efuncs=efunc0) + # Same story for the frees + name = self.sregistry.make_name(prefix='free') + efunc1 = make_callable(name, frees) + free = Call(name, efunc1.parameters) + + storage.update(obj, site, allocs=alloc, frees=free, efuncs=(efunc0, efunc1)) + storage.include(self.langbb['header-array']) def _alloc_object_array_on_low_lat_mem(self, site, obj, storage): """ @@ -414,18 +489,15 @@ def place_definitions(self, iet, globs=None, **kwargs): self._alloc_pointed_array_on_high_bw_mem(iet, i, storage) # Handle postponed global objects - includes = set() if isinstance(iet, EntryFunction) and globs: for i in sorted(globs, key=lambda f: f.name): - v = self._alloc_array_on_global_mem(iet, i, storage) - if v: - includes.add(v) + self._alloc_array_on_global_mem(iet, i, storage) iet, efuncs = self._inject_definitions(iet, storage) return iet, {'efuncs': efuncs, 'globals': as_tuple(globs), - 'includes': as_tuple(includes)} + 'includes': as_tuple(sorted(storage.includes))} @iet_pass def place_casts(self, iet, **kwargs): diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index b9b5bf15d4..a0d73c591f 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -1,18 +1,26 @@ from collections import OrderedDict, defaultdict from functools import partial, singledispatch, wraps -from devito.ir.iet import (Call, ExprStmt, Iteration, SyncSpot, AsyncCallable, - FindNodes, FindSymbols, MapNodes, MetaCall, Transformer, - EntryFunction, ThreadCallable, Uxreplace, - derive_parameters) +import numpy as np +from sympy import Mul + +from devito.ir.iet import ( + Call, DummyExpr, ExprStmt, Expression, List, Iteration, SyncSpot, + AsyncCallable, FindNodes, FindSymbols, MapNodes, MetaCall, Transformer, + EntryFunction, ThreadCallable, KernelLaunch, Uxreplace, derive_parameters +) from devito.ir.support import SymbolRegistry from devito.mpi.distributed import MPINeighborhood +from devito.mpi.routines import CopyBuffer from devito.passes import needs_transfer -from devito.symbolics import FieldFromComposite, FieldFromPointer +from devito.symbolics import (FieldFromComposite, FieldFromPointer, Byref, Cast, + Deref, search) from devito.tools import DAG, as_tuple, filter_ordered, sorted_priority, timed_pass -from devito.types import (Array, Bundle, CompositeObject, Lock, IncrDimension, - ModuloDimension, Indirection, Pointer, SharedData, - ThreadArray, Temp, NPThreads, NThreadsBase, Wildcard) +from devito.types import ( + Array, Bundle, ComponentAccess, CompositeObject, Lock, IncrDimension, + ModuloDimension, Indirection, Pointer, SharedData, ThreadArray, Symbol, Temp, + NPThreads, NThreadsBase, Wildcard +) from devito.types.args import ArgProvider from devito.types.dense import DiscreteFunction from devito.types.dimension import AbstractIncrDimension, BlockDimension @@ -147,6 +155,7 @@ def apply(self, func, **kwargs): # Minimize code size if len(efuncs) > len(self.efuncs): efuncs = reuse_compounds(efuncs, self.sregistry) + efuncs = abstract_component_accesses(self.root, efuncs, self.sregistry) efuncs = reuse_efuncs(self.root, efuncs, self.sregistry) self.efuncs = efuncs @@ -316,6 +325,81 @@ def _(i, sregistry=None): return i._rebuild(pname=pname, cfields=cfields, ncfields=ncfields, function=None) +def abstract_component_accesses(root, efuncs, sregistry=None): + """ + Generalise `efuncs` by replacing ComponentAccesses with pointer arithmetic + where possible and useful. + """ + candidates = (CopyBuffer,) + + efuncs = dict(efuncs) + + # Transform the candidate efuncs + for k, efunc in efuncs.items(): + if not isinstance(efunc, candidates): + continue + + # We have to run the pass when the KernelLaunches (if any) are visible, + # otherwise it gets way too complicated + calls = FindNodes(Call).visit(efunc) + try: + call, = [c for c in calls if isinstance(c, KernelLaunch)] + except ValueError: + continue + kernel = efuncs[call.name] + + # There's either one single ComponentAccess or it's not worth it (as + # in the case of a whole-Bundle access) + found = defaultdict(set) + for e in FindNodes(Expression).visit(kernel): + for v in search(e.expr, ComponentAccess): + found[e].add(v) + if len(found) != 1: + continue + expr, compaccs = found.popitem() + assert len(compaccs) == 1 + + ca, = compaccs + f = ca.function + + arity = Symbol(name='arity', dtype=np.uint32, is_const=True) + index = Symbol(name='index', dtype=np.uint32, is_const=True) + + # Access the same entry as `ca` but via pointer arithmetic instead + indices = [Mul(arity, i, evaluate=False) for i in ca.indices] + indices[-1] += index + kernel1 = Uxreplace({ca: f.c0.indexed[indices]}).visit(kernel) + + # Update the parameters list + parameters = [*kernel1.parameters, index, arity] + parameters[parameters.index(f.indexed)] = f.c0.indexed + kernel1 = kernel1._rebuild(parameters=parameters) + + # Update the Call site + ffp0 = FieldFromPointer(f._C_field_dmap, f._C_symbol) + ffp1 = FieldFromPointer(f._C_field_arity, f._C_symbol) + args = [*call.arguments, index, ffp1] + args[args.index(f.dmap)] = Cast(ffp0, dtype=f.c0.indexed._C_ctype, + reinterpret=True) + call1 = call._rebuild(arguments=args) + efunc1 = Transformer({call: call1}).visit(efunc) + + #TODO Propagate index through call stack until haloupdate... + m = {index: ca.index} + #TODO _filter->key + #TODO UPDATE key so that if efunc.name == root.name, use ca.index... + _filter = lambda arguments: [m.get(a, a) for a in arguments] + efuncs = update_call_stack(call, efuncs, _filter, cascade=True) + + # Store the new Callables + efuncs.up + processed.update({kernel1.name: kernel1, k: efunc1}) + + efuncs = {**efuncs, **processed} + + return efuncs + + def reuse_efuncs(root, efuncs, sregistry=None): """ Generalise `efuncs` so that syntactically identical Callables may be dropped, @@ -634,7 +718,13 @@ def update_args(root, efuncs, dag): # Create the new parameters and arguments lists - def _filter(v, efunc=None): + def callback(maybe_call, efunc=None): + #TODO: FIX UPDATE HERE + if isinstance(maybe_call, Call): + v = maybe_call.arguments + else: + v = maybe_call.parameters + processed = [a for i, a in enumerate(v) if i not in drop_params] for a in new_params: @@ -654,13 +744,30 @@ def _filter(v, efunc=None): return processed efuncs = OrderedDict(efuncs) - efuncs[root.name] = root._rebuild(parameters=_filter(root.parameters, root)) + efuncs[root.name] = root._rebuild(parameters=callback(root.parameters, root)) # Update all call sites to use the new signature + efuncs = update_call_stack(root, efuncs, callback, dag=dag) + + return efuncs + + +def update_call_stack(root, efuncs, callback, dag=None, cascade=False): + """ + Update, optionally recursively, the Calls into `root` via a given callback. + """ + efuncs = dict(efuncs) + dag = dag or create_call_graph(root.name, efuncs) + for n in dag.downstream(root.name): - mapper = {c: c._rebuild(arguments=_filter(c.arguments)) - for c in FindNodes(Call).visit(efuncs[n]) - if c.name == root.name} - efuncs[n] = Transformer(mapper).visit(efuncs[n]) + efunc = efuncs[n] + calls = FindNodes(Call).visit(efunc) + + #TODO WATCH OUT HERE AS I'M NOW ALSO PASSING IN efunc... + mapper = {c: c._rebuild(arguments=callback(c, efunc)) + for c in calls if c.name == root.name} + efuncs[n] = Transformer(mapper).visit(efunc) + + #TODO IMPLEMETN RECURSION IF CASCADE IS TRUE return efuncs diff --git a/devito/passes/iet/languages/C.py b/devito/passes/iet/languages/C.py index bfb0935a20..54366ba2ff 100644 --- a/devito/passes/iet/languages/C.py +++ b/devito/passes/iet/languages/C.py @@ -5,7 +5,8 @@ from devito.passes.iet.definitions import DataManager from devito.passes.iet.orchestration import Orchestrator from devito.passes.iet.langbase import LangBB -from devito.symbolics.extended_dtypes import c_complex, c_double_complex +from devito.symbolics import c_complex, c_double_complex +from devito.tools import dtype_to_cstr __all__ = ['CBB', 'CDataManager', 'COrchestrator'] @@ -13,6 +14,8 @@ class CBB(LangBB): mapper = { + # Misc + 'header-array': None, # Complex 'includes-complex': 'complex.h', # Allocs @@ -55,3 +58,12 @@ class CPrinter(BasePrinter, C99CodePrinter): def _print_ImaginaryUnit(self, expr): return '_Complex_I' + + def _print_ListInitializer(self, expr): + li = super()._print_ListInitializer(expr) + if expr.dtype: + # C99, unlike CXX, supports compound literals + tstr = dtype_to_cstr(expr.dtype) + return f'({tstr}[]){li}' + else: + return li diff --git a/devito/passes/iet/languages/CIR.py b/devito/passes/iet/languages/CIR.py new file mode 100644 index 0000000000..93e2eecc1f --- /dev/null +++ b/devito/passes/iet/languages/CIR.py @@ -0,0 +1,11 @@ +from .C import CPrinter + + +class CIRPrinter(CPrinter): + + """ + A smarter printer for IETs undergoing lowering. + """ + + def _print_ComponentAccess(self, expr): + return f"{self._print(expr.base)}.<{expr.sindex},{expr.function.ncomp}>" diff --git a/devito/passes/iet/languages/CXX.py b/devito/passes/iet/languages/CXX.py index 82e099e88c..6a09c375ab 100644 --- a/devito/passes/iet/languages/CXX.py +++ b/devito/passes/iet/languages/CXX.py @@ -3,7 +3,8 @@ from devito.ir import Call, UsingNamespace, BasePrinter from devito.passes.iet.langbase import LangBB -from devito.symbolics.extended_dtypes import c_complex, c_double_complex +from devito.symbolics import c_complex, c_double_complex +from devito.tools import dtype_to_cstr __all__ = ['CXXBB'] @@ -65,6 +66,8 @@ def std_arith(prefix=None): class CXXBB(LangBB): mapper = { + # Misc + 'header-array': 'array', # Complex 'includes-complex': 'complex', 'complex-namespace': [UsingNamespace('std::complex_literals')], @@ -112,3 +115,12 @@ def _print_Cast(self, expr): caster = 'reinterpret_cast' if expr.reinterpret else 'static_cast' cast = f'{caster}<{tstr}{self._print(expr.stars)}>' return self._print_UnaryOp(expr, op=cast, parenthesize=True) + + def _print_ListInitializer(self, expr): + li = super()._print_ListInitializer(expr) + if expr.dtype: + # CXX, unlike C99, does not support compound literals + tstr = dtype_to_cstr(expr.dtype) + return f'std::array<{tstr}, {len(expr.params)}>{li}.data()' + else: + return li diff --git a/devito/passes/iet/linearization.py b/devito/passes/iet/linearization.py index ddb0122794..032271f676 100644 --- a/devito/passes/iet/linearization.py +++ b/devito/passes/iet/linearization.py @@ -223,7 +223,7 @@ def linearize_accesses(iet, key0, tracker=None): # 2) What `iet` *offers* # E.g. `{x_fsz0 -> u_vec->size[1]}` - defines = FindSymbols('defines-aliases').visit(iet) + defines = FindSymbols('defines').visit(iet) offers = filter_ordered(i for i in defines if key0(i.function)) instances = {} for i in offers: @@ -294,16 +294,9 @@ def _(f, d): @_generate_fsz.register(Array) -def _(f, d): - return f.symbolic_shape[d] - - @_generate_fsz.register(Bundle) def _(f, d): - if f.is_DiscreteFunction: - return _generate_fsz.registry[DiscreteFunction](f, d) - else: - return _generate_fsz.registry[Array](f, d) + return f.symbolic_shape[d] @singledispatch diff --git a/devito/passes/iet/mpi.py b/devito/passes/iet/mpi.py index 930cc108b2..73670af072 100644 --- a/devito/passes/iet/mpi.py +++ b/devito/passes/iet/mpi.py @@ -83,7 +83,6 @@ def _hoist_invariant(iet): for it, halo_spots in iter_mapper.items(): for hs0, hs1 in combinations(halo_spots, r=2): - if _check_control_flow(hs0, hs1, cond_mapper): continue @@ -129,10 +128,9 @@ def _hoist_invariant(iet): def _merge_halospots(iet): """ - Merge HaloSpots on the same Iteration tree level where all data dependencies - would be honored. Avoids redundant halo exchanges when the same data is - redundantly exchanged within the same Iteration tree level as well as to initiate - multiple halo exchanges at once. + Using data dependence analysis, merge HaloSpots on the same IET level. This + has two effects: anticipating communication over computation, and (potentially) + avoiding redundant halo exchanges. Example: @@ -141,7 +139,6 @@ def _merge_halospots(iet): W v[t1]- R v[t0] W v[t1]- R v[t0] haloupd v[t0], h W g[t1]- R v[t0], h W g[t1]- R v[t0], h - """ # Analysis @@ -155,16 +152,15 @@ def _merge_halospots(iet): hs0 = halo_spots[0] for hs1 in halo_spots[1:]: - if _check_control_flow(hs0, hs1, cond_mapper): continue for f, v in hs1.fmapper.items(): for dep in scope.d_flow.project(f): - if not any(r(dep, hs1, v.loc_indices) for r in rules): + if not any(rule(dep, hs1, v.loc_indices) for rule in rules): break else: - # hs1 is merged with hs0 + # All good -- `hs1` can be merged with `hs0` hs = hs1.halo_scheme.project(f) mapper[hs0] = HaloScheme.union([mapper.get(hs0, hs0.halo_scheme), hs]) mapper[hs1] = mapper.get(hs1, hs1.halo_scheme).drop(f) @@ -198,8 +194,8 @@ def _drop_if_unwritten(iet, options=None, **kwargs): writes = {i.write for i in FindNodes(Expression).visit(iet)} mapper = {} for hs in FindNodes(HaloSpot).visit(iet): - for f in hs.fmapper: - if f not in writes and key(f): + for f, v in hs.fmapper.items(): + if not writes.intersection({f, v.bundle}) and key(f): mapper[hs] = mapper.get(hs, hs.halo_scheme).drop(f) # Post-process analysis diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py index ef91ef4d49..eca75fefb2 100644 --- a/devito/symbolics/extended_sympy.py +++ b/devito/symbolics/extended_sympy.py @@ -148,7 +148,10 @@ def is_commutative(self): """ Overridden SymPy assumption -- now based on the wrapped object dtype. """ - return issubclass(self.dtype, np.number) + try: + return issubclass(self.dtype, np.number) + except TypeError: + return self.dtype in ctypes_vector_mapper def _sympystr(self, printer): return str(self) @@ -289,8 +292,9 @@ class ListInitializer(sympy.Expr, Pickable): """ __rargs__ = ('params',) + __rkwargs__ = ('dtype',) - def __new__(cls, params): + def __new__(cls, params, dtype=None): args = [] for p in as_tuple(params): try: @@ -298,7 +302,10 @@ def __new__(cls, params): except sympy.SympifyError: raise ValueError(f"Illegal param `{p}`") obj = sympy.Expr.__new__(cls, *args) + obj.params = tuple(args) + obj.dtype = dtype + return obj def __str__(self): diff --git a/devito/symbolics/queries.py b/devito/symbolics/queries.py index c4002508cb..4b7f04253b 100644 --- a/devito/symbolics/queries.py +++ b/devito/symbolics/queries.py @@ -6,13 +6,14 @@ from devito.types.basic import AbstractFunction from devito.types.constant import Constant from devito.types.dimension import Dimension +from devito.types.array import ComponentAccess from devito.types.object import AbstractObject __all__ = ['q_leaf', 'q_indexed', 'q_terminal', 'q_function', 'q_routine', 'q_terminalop', 'q_indirect', 'q_constant', 'q_affine', 'q_linear', - 'q_identity', 'q_symbol', 'q_multivar', 'q_monoaffine', 'q_dimension', - 'q_positive', 'q_negative'] + 'q_identity', 'q_symbol', 'q_comp_acc', 'q_multivar', 'q_monoaffine', + 'q_dimension', 'q_positive', 'q_negative'] # The following SymPy objects are considered tree leaves: @@ -31,6 +32,10 @@ def q_symbol(expr): return False +def q_comp_acc(expr): + return isinstance(expr, ComponentAccess) + + def q_leaf(expr): return (expr.is_Atom or expr.is_Indexed or diff --git a/devito/types/array.py b/devito/types/array.py index 6a1bc9eb90..f9fd6358b1 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -1,4 +1,4 @@ -from ctypes import POINTER, Structure, c_void_p, c_ulong, c_uint64 +from ctypes import POINTER, Structure, c_void_p, c_uint64 from functools import cached_property import numpy as np @@ -10,7 +10,7 @@ from devito.types.utils import CtypesFactory, DimensionTuple __all__ = ['Array', 'ArrayMapped', 'ArrayObject', 'PointerArray', 'Bundle', - 'ComponentAccess', 'Bag'] + 'ComponentAccess', 'Bag', 'BundleView'] class ArrayBasic(AbstractFunction, LocalType): @@ -40,7 +40,7 @@ def __indices_setup__(cls, *args, **kwargs): @property def _C_name(self): - if self._mem_stack or self._mem_constant: + if self._mem_stack or self._mem_constant or self._mem_rvalue: # No reason to distinguish between two different names, that is # the _C_name and the name -- just `self.name` is enough return self.name @@ -60,6 +60,13 @@ def shape_allocated(self): def is_const(self): return self._is_const + @property + def c0(self): + # ArrayBasic can be used as a base class for tensorial objects (that is, + # arrays whose components are AbstractFunctions). This property enables + # treating the two cases uniformly in some lowering passes + return self + class Array(ArrayBasic): @@ -90,7 +97,7 @@ class Array(ArrayBasic): to 'local'. Used to override `_mem_local` and `_mem_mapped`. scope : str, optional The scope in the given memory space. Allowed values: 'heap', 'stack', - 'static', 'constant', 'shared', 'shared-remote', 'registers'. + 'static', 'constant', 'shared', 'shared-remote', 'registers', 'rvalue'. 'static' refers to a static array in a C/C++ sense. 'constant' and 'shared' mean that the Array represents an object allocated in so called constant and shared memory, respectively, which are typical of @@ -100,7 +107,9 @@ class Array(ArrayBasic): architecture doesn't have something akin to constant memory, the Array falls back to a global, const, static array in a C/C++ sense. 'registers' is used to indicate that the Array has a small static size - and, as such, it could be allocated in registers. Defaults to 'heap'. + and, as such, it could be allocated in registers. If `rvalue`, the + Array is treated as a temporary or "transient" object, just like + C++'s rvalue references or C's compound literals. Defaults to 'heap'. Note that not all scopes make sense for a given space. grid : Grid, optional Only necessary for distributed-memory parallelism; a Grid contains @@ -135,7 +144,7 @@ def __init_finalize__(self, *args, **kwargs): self._scope = kwargs.get('scope', 'heap') assert self._scope in ['heap', 'stack', 'static', 'constant', 'shared', - 'shared-remote', 'registers'] + 'shared-remote', 'registers', 'rvalue'] self._initvalue = kwargs.get('initvalue') assert self._initvalue is None or self._scope != 'heap' @@ -190,6 +199,10 @@ def _mem_registers(self): def _mem_constant(self): return self._scope == 'constant' + @property + def _mem_rvalue(self): + return self._scope == 'rvalue' + @property def initvalue(self): return self._initvalue @@ -202,19 +215,29 @@ def _make_pointer(self, dim): return PointerArray(name='p%s' % self.name, dimensions=dim, array=self) -class ArrayMapped(Array): +class MappedArrayMixin: _C_structname = 'array' _C_field_data = 'data' _C_field_dmap = 'dmap' - _C_field_nbytes = 'nbytes' + _C_field_shape = 'shape' _C_field_size = 'size' + _C_field_nbytes = 'nbytes' + _C_field_arity = 'arity' + + _C_size_type = c_uint64 _C_ctype = POINTER(type(_C_structname, (Structure,), {'_fields_': [(_C_field_data, c_restrict_void_p), - (_C_field_nbytes, c_ulong), (_C_field_dmap, c_void_p), - (_C_field_size, c_uint64)]})) + (_C_field_shape, POINTER(_C_size_type)), + (_C_field_size, _C_size_type), + (_C_field_nbytes, _C_size_type), + (_C_field_arity, _C_size_type)]})) + + +class ArrayMapped(MappedArrayMixin, Array): + pass class ArrayObject(ArrayBasic): @@ -343,7 +366,7 @@ def array(self): return self._array -class Bundle(ArrayBasic): +class Bundle(MappedArrayMixin, ArrayBasic): """ Tensor symbol representing an unrolled vector of AbstractFunctions. @@ -417,7 +440,6 @@ def __halo_setup__(self, components=(), **kwargs): @property def c0(self): - # Shortcut for self.components[0] return self.components[0] # Class attributes overrides @@ -456,18 +478,27 @@ def ncomp(self): def initvalue(self): return None - # Overrides defaulting to self.c0's behaviour - + # Defaulting to self.c0's behaviour for i in ('_mem_internal_eager', '_mem_internal_lazy', '_mem_local', '_mem_mapped', '_mem_host', '_mem_stack', '_mem_constant', - '_mem_shared', '_mem_shared_remote', '__padding_dtype__', - '_size_domain', '_size_halo', '_size_owned', '_size_padding', - '_size_nopad', '_size_nodomain', '_offset_domain', - '_offset_halo', '_offset_owned', '_dist_dimensions', - '_C_get_field', 'grid', 'symbolic_shape', + '_mem_shared', '_mem_shared_remote', '_mem_registers', + '_mem_rvalue', '__padding_dtype__', '_size_domain', '_size_halo', + '_size_owned', '_size_padding', '_size_nopad', '_size_nodomain', + '_offset_domain', '_offset_halo', '_offset_owned', + '_dist_dimensions', '_C_get_field', 'grid', *AbstractFunction.__properties__): locals()[i] = property(lambda self, v=i: getattr(self.c0, v)) + # Other overrides + + @cached_property + def symbolic_shape(self): + from devito.symbolics import FieldFromPointer, IndexedPointer # noqa + ffp = FieldFromPointer(self._C_field_shape, self._C_symbol) + ret = [s if is_integer(s) else IndexedPointer(ffp, i) + for i, s in enumerate(super().symbolic_shape)] + return DimensionTuple(*ret, getters=self.dimensions) + @property def _mem_heap(self): return not any([self._mem_stack, self._mem_shared, self._mem_shared_remote]) @@ -490,16 +521,10 @@ def __getitem__(self, index): raise ValueError("Expected %d or %d indices, got %d instead" % (self.ndim, self.ndim + 1, len(index))) - _C_structname = ArrayMapped._C_structname - _C_field_data = ArrayMapped._C_field_data - _C_field_nbytes = ArrayMapped._C_field_nbytes - _C_field_dmap = ArrayMapped._C_field_dmap - _C_field_size = ArrayMapped._C_field_size - @property def _C_ctype(self): if self._mem_mapped: - return ArrayMapped._C_ctype + return super()._C_ctype else: return POINTER(dtype_to_ctype(self.dtype)) @@ -518,6 +543,31 @@ def handles(self): return self.components +class BundleView(Bundle): + + """ + A BundleView is like a Bundle but it doesn't represent a concrete object + in the generated code. It's used by the compiler to represent a subset + of the components of a Bundle. + """ + + __rkwargs__ = Bundle.__rkwargs__ + ('parent',) + + def __new__(cls, *args, parent=None, **kwargs): + obj = super().__new__(cls, *args, **kwargs) + obj._parent = parent + + return obj + + @property + def parent(self): + return self._parent + + @property + def handles(self): + return (self.parent,) + + class ComponentAccess(Expr, Pickable): _component_names = ('x', 'y', 'z', 'w') @@ -569,6 +619,10 @@ def sindex(self): def function(self): return self.base.function + @property + def function_access(self): + return self.function.components[self.index] + @property def indices(self): return self.base.indices diff --git a/devito/types/basic.py b/devito/types/basic.py index 04e4281f12..466508df75 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -42,7 +42,8 @@ class CodeSymbol: * "liveness": `_mem_external`, `_mem_internal_eager`, `_mem_internal_lazy` * "space": `_mem_local`, `_mem_mapped`, `_mem_host` * "scope": `_mem_stack`, `_mem_heap`, `_mem_global`, `_mem_shared`, - `_mem_shared_remote`, `_mem_constant` + `_mem_shared_remote`, `_mem_constant`, `_mem_registers`, + `_mem_rvalue` For example, an object that is `<_mem_internal_lazy, _mem_local, _mem_heap>` is allocated within the Operator entry point, on either the host or device @@ -230,6 +231,21 @@ def _mem_shared_remote(self): """ return False + @property + def _mem_registers(self): + """ + True if the associated data is allocated in registers, False otherwise. + """ + return False + + @property + def _mem_rvalue(self): + """ + True if the associated data is allocated in a temporary (or "transient") + variable, such as rvalues in CXX, False otherwise. + """ + return False + class Basic(CodeSymbol): diff --git a/devito/types/misc.py b/devito/types/misc.py index ce03ef60d3..47fa6d601a 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -14,7 +14,8 @@ __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', 'Fence', 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump', - 'nop', 'WeakFence', 'CriticalRegion', 'Auto', 'AutoRef', 'auto'] + 'nop', 'WeakFence', 'CriticalRegion', 'Auto', 'AutoRef', 'auto', + 'size_t'] class Timer(CompositeObject): @@ -344,7 +345,9 @@ def closing(self): """ -# *** CXX support types +# *** C/CXX support types + +size_t = CustomDtype('size_t') # NOTE: In C++, `auto` is a type specifier more than a type itself, but # it's a distinction we can afford to ignore, at least for now diff --git a/tests/test_buffering.py b/tests/test_buffering.py index c1196466e3..d3f04826ba 100644 --- a/tests/test_buffering.py +++ b/tests/test_buffering.py @@ -23,7 +23,7 @@ def test_read_write(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 3 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 assert buffers.pop().symbolic_shape[0] == 2 @@ -51,7 +51,7 @@ def test_write_only(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 3 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 op0.apply(time_M=nt-2) @@ -79,7 +79,7 @@ def test_read_only(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 4 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 op0.apply(time_M=nt-2) @@ -128,7 +128,7 @@ def test_read_only_backwards(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 4 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 op0.apply(time_m=1) @@ -159,7 +159,7 @@ def test_read_only_backwards_unstructured(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 3 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 op0.apply(time_m=2) @@ -183,7 +183,7 @@ def test_async_degree(async_degree): # Check generated code assert len(retrieve_iteration_tree(op1)) == 3 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 assert buffers.pop().symbolic_shape[0] == async_degree @@ -212,7 +212,7 @@ def test_two_homogeneous_buffers(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 5 assert len(retrieve_iteration_tree(op2)) == 3 - buffers = [i for i in FindSymbols().visit(op1.body) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1.body) if i.is_Array and i._mem_heap] assert len(buffers) == 2 op0.apply(time_M=nt-2) @@ -243,7 +243,7 @@ def test_two_heterogeneous_buffers(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 5 - buffers = [i for i in FindSymbols().visit(op1.body) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1.body) if i.is_Array and i._mem_heap] assert len(buffers) == 2 op0.apply(time_M=nt-2) @@ -410,7 +410,7 @@ def test_subdims(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 3 - assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 1 + assert len([i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap]) == 1 op0.apply(time_M=nt-2) op1.apply(time_M=nt-2, u=u1) @@ -442,7 +442,7 @@ def test_conddim_backwards(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 4 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 op0.apply(time_m=1, time_M=9) @@ -475,7 +475,7 @@ def test_conddim_backwards_multi_slots(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 4 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 op0.apply(time_m=1, time_M=9) @@ -512,7 +512,7 @@ def test_conddim_backwards_unstructured(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 4 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 # Note 1: cannot use time_m<4 or time_M>14 or there would be OOB accesses @@ -557,7 +557,7 @@ def test_conddim_w_shifting(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 4 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 # From time_m=15 to time_M=35 with a factor=5 -- it means that, thanks @@ -596,7 +596,7 @@ def test_multi_access(): # Check generated code assert len(retrieve_iteration_tree(op1)) == 3 - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] + buffers = [i for i in FindSymbols().visit(op1) if i.is_Array and i._mem_heap] assert len(buffers) == 1 op0.apply(time_M=nt-2) @@ -660,7 +660,8 @@ def test_everything(): op1 = Operator(eqns, opt='buffering') # Check generated code - assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array]) == 2 + assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array + and i._mem_heap]) == 2 op0.apply(time_m=15, time_M=35, save_shift=0) op1.apply(time_m=15, time_M=35, save_shift=0, u=u1) diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 13239687bc..6b28056d7c 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -461,8 +461,8 @@ def test_tasking_lock_placement(self): 'while(lock0[t1] == 0);' @pytest.mark.parametrize('opt,ntmps', [ - (('buffering', 'streaming', 'orchestrate'), 2), - (('buffering', 'streaming', 'orchestrate', {'linearize': True}), 2), + (('buffering', 'streaming', 'orchestrate'), 3), + (('buffering', 'streaming', 'orchestrate', {'linearize': True}), 3), ]) def test_streaming_basic(self, opt, ntmps): nt = 10 @@ -491,8 +491,8 @@ def test_streaming_basic(self, opt, ntmps): assert np.all(u.data[1] == 36) @pytest.mark.parametrize('opt,ntmps', [ - (('buffering', 'streaming', 'orchestrate'), 13), - (('buffering', 'streaming', 'fuse', 'orchestrate', {'fuse-tasks': True}), 7), + (('buffering', 'streaming', 'orchestrate'), 14), + (('buffering', 'streaming', 'fuse', 'orchestrate', {'fuse-tasks': True}), 8), ]) def test_streaming_two_buffers(self, opt, ntmps): nt = 10 @@ -631,7 +631,7 @@ def test_streaming_conddim_backward(self, opt): assert np.all(u.data[1] == 9) @pytest.mark.parametrize('opt,ntmps', [ - (('buffering', 'streaming', 'orchestrate'), 2), + (('buffering', 'streaming', 'orchestrate'), 3), ]) def test_streaming_multi_input(self, opt, ntmps): nt = 100 @@ -720,7 +720,7 @@ def test_streaming_multi_input_conddim_backward(self): assert np.all(v.data == v1.data) @pytest.mark.parametrize('opt,ntmps', [ - (('buffering', 'streaming', 'orchestrate'), 2), + (('buffering', 'streaming', 'orchestrate'), 3), ]) def test_streaming_postponed_deletion(self, opt, ntmps): nt = 10 @@ -1094,8 +1094,8 @@ def test_save_w_subdims(self): assert np.all(usave.data[i, :, -3:] == 0) @pytest.mark.parametrize('opt,ntmps', [ - (('buffering', 'streaming', 'orchestrate'), 2), - (('buffering', 'streaming', 'orchestrate', {'linearize': True}), 2), + (('buffering', 'streaming', 'orchestrate'), 3), + (('buffering', 'streaming', 'orchestrate', {'linearize': True}), 3), ]) def test_streaming_w_shifting(self, opt, ntmps): nt = 50 @@ -1178,11 +1178,11 @@ def test_streaming_complete(self): # Check generated code diff = int(configuration['language'] == 'openmp') assert len(op1._func_table) == 14 - diff - assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 8 - diff + assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 9 - diff assert len(op2._func_table) == 14 - diff - assert len([i for i in FindSymbols().visit(op2) if i.is_Array]) == 8 - diff + assert len([i for i in FindSymbols().visit(op2) if i.is_Array]) == 9 - diff assert len(op3._func_table) == 10 - diff - assert len([i for i in FindSymbols().visit(op3) if i.is_Array]) == 7 - diff + assert len([i for i in FindSymbols().visit(op3) if i.is_Array]) == 8 - diff op0.apply(time_m=15, time_M=35, save_shift=0) op1.apply(time_m=15, time_M=35, save_shift=0, u=u1) diff --git a/tests/test_ir.py b/tests/test_ir.py index 350de3ae20..8706708ffd 100644 --- a/tests/test_ir.py +++ b/tests/test_ir.py @@ -17,7 +17,7 @@ from devito.ir.support.guards import GuardOverflow from devito.symbolics import DefFunction, FieldFromPointer from devito.tools import prod -from devito.types import Array, CriticalRegion, Jump, Scalar, Symbol +from devito.types import Array, Bundle, CriticalRegion, Jump, Scalar, Symbol class TestVectorHierarchy: @@ -887,6 +887,32 @@ def test_critical_region_v1(self): assert len(scope.reads[mocksym1]) == 2 assert len(scope.d_all) == 9 + def test_bundle_components(self): + grid = Grid(shape=(4, 4)) + x, y = grid.dimensions + + f = Function(name='f', grid=grid) + g = Function(name='g', grid=grid) + v = Function(name='v', grid=grid) + w = Function(name='w', grid=grid) + u0 = Function(name='u0', grid=grid) + u1 = Function(name='u1', grid=grid) + + fg = Bundle(name='fg', components=(f, g)) + vw = Bundle(name='vw', components=(v, w)) + + exprs = [Eq(fg.indexify(), 1), + Eq(u0.indexify(), fg[0, x, y] + 2), + Eq(vw[0, x, y], 3), + Eq(u1.indexify(), vw[1, x, y] + 4)] + exprs = [LoweredEq(i) for i in exprs] + + scope = Scope(exprs) + assert len(scope.d_all) == 1 + assert len(scope.d_flow) == 1 + dep, = scope.d_flow + assert dep.function is f + class TestParallelismAnalysis: diff --git a/tests/test_linearize.py b/tests/test_linearize.py index 56f31ebe1f..113ccc8f0a 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -342,20 +342,21 @@ def test_strides_forwarding1(): linearize(graph, callback=True, options={'index-mode': 'int32'}, sregistry=SymbolRegistry()) - # Despite `a` is passed via `a.indexed`, and since it's an Array (which - # have symbolic shape), we expect the stride exprs to be placed in `bar`, - # and in `bar` only, as `foo` doesn't really use `a`, it just propagates it - # down to `bar` + # `a` is passed via `a.indexed`, so the stride exprs are expected to be + # placed in `foo` and then passed down to `bar` as arguments foo = graph.root bar = graph.efuncs['bar'] assert len(foo.body.body) == 1 assert foo.body.body[0].is_Call + assert len(foo.body.strides) == 3 + assert foo.body.strides[0].write.name == 'y_fsz0' + assert foo.body.strides[2].write.name == 'y_stride0' + assert len(bar.parameters) == 2 + assert bar.parameters[0].name == 'a' + assert bar.parameters[1].name == 'y_stride0' assert len(bar.body.body) == 1 - assert len(bar.body.strides) == 3 - assert bar.body.strides[0].write.name == 'y_fsz0' - assert bar.body.strides[2].write.name == 'y_stride0' def test_strides_forwarding2():