diff --git a/accelforge/frontend/arch/_flattened_arch.py b/accelforge/frontend/arch/_flattened_arch.py index 2567b278..15e60cb8 100644 --- a/accelforge/frontend/arch/_flattened_arch.py +++ b/accelforge/frontend/arch/_flattened_arch.py @@ -1,3 +1,12 @@ +from typing import TypeVar, Callable + + +_FIND_SENTINEL = object() + +D = TypeVar("D") +T = TypeVar("T") + + class FlattenedArch: """ A flattened arch is an architecture spec that has been @@ -52,3 +61,124 @@ def is_above(self, name_a: str, name_b: str): idx_a = self.index(name_a) idx_b = self.index(name_b) return idx_a < idx_b + + def find_first_of_type_between( + self, + node_type: T, + name_lower: str, + name_upper: str, + default: D = _FIND_SENTINEL, + top_bottom: bool = True, + ) -> T | D: + """ + Returns the first node with type `node_type` above `name_lower` and under `name_upper`. + + If `name` does not exist, raises an error. + + If no node of `node_type` is found, either `default` is + returned (if provided) or raises an error. + """ + upper_idx = self.index(name_upper) + lower_idx = self.index(name_lower) + + iterator = self.nodes + if not top_bottom: + iterator = reversed(top_bottom) + for i, node in enumerate(iterator): + if not isinstance(node, node_type) or i <= upper_idx or i >= lower_idx: + continue + else: + return node + if default is not _FIND_SENTINEL: + return default + else: + raise ValueError(f"node with type {node_type} between {name_upper} and {name_lower} not found") + + def find_first_of_type_above( + self, + node_type: T, + name_lower: str, + default: D = _FIND_SENTINEL, + top_bottom: bool = True, + ) -> T | D: + """ + Returns the first node with type `node_type` above `name_lower` and under `name_upper`. + + If `name` does not exist, raises an error. + + If no node of `node_type` is found, either `default` is + returned (if provided) or raises an error. + """ + lower_idx = self.index(name_lower) + + iterator = self.nodes + if not top_bottom: + iterator = reversed(top_bottom) + for i, node in enumerate(iterator): + if not isinstance(node, node_type) or i >= lower_idx: + continue + else: + return node + if default is not _FIND_SENTINEL: + return default + else: + raise ValueError(f"node with type {node_type} above {name_lower} not found") + + def find_first_of_type_below( + self, + node_type: T, + name_upper: str, + default: D = _FIND_SENTINEL, + top_bottom: bool = True, + ) -> T | D: + """ + Returns the first node with type `node_type` above `name_lower` and under `name_upper`. + + If `name` does not exist, raises an error. + + If no node of `node_type` is found, either `default` is + returned (if provided) or raises an error. + """ + upper_idx = self.index(name_upper) + + iterator = self.nodes + if not top_bottom: + iterator = reversed(top_bottom) + for i, node in enumerate(iterator): + if not isinstance(node, node_type) or i <= upper_idx: + continue + else: + return node + if default is not _FIND_SENTINEL: + return default + else: + raise ValueError(f"node with type {node_type} below {name_upper} not found") + + def first_below( + self, + name: str, + filter: Callable = None, + default: D = _FIND_SENTINEL, + ) -> T | D: + """ + Returns the first node with type `node_type` above `name_lower` and under `name_upper`. + + If `name` does not exist, raises an error. + + If no node of `node_type` is found, either `default` is + returned (if provided) or raises an error. + """ + idx = self.index(name) + + if filter is None: + filter = lambda x: True + + for i, node in enumerate(self.nodes): + if not filter(node) or i <= idx: + continue + else: + return node + if default is not _FIND_SENTINEL: + return default + else: + raise ValueError(f"node below {name} not found") diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py index f30360ae..867cb03b 100644 --- a/accelforge/frontend/arch/components.py +++ b/accelforge/frontend/arch/components.py @@ -145,7 +145,7 @@ def _set_n_calls(self, value: int | float) -> None: @classmethod def _deprecate_latency_fields(cls, data): if isinstance(data, dict): - if "latency" in data: + if "latency" in data and not "throughput" in data: l = data.pop("latency") warnings.warn( f"Setting `latency` on `{cls.__name__}` is deprecated; use " @@ -155,16 +155,11 @@ def _deprecate_latency_fields(cls, data): DeprecationWarning, stacklevel=2, ) - if "throughput" in data: - raise ValueError( - f"Cannot specify both `latency` and `throughput` on " - f"`{cls.__name__}`. Drop the deprecated `latency` field." - ) l = str(l).strip() data["throughput"] = ( f"1 / ({l}) if ({l}) != 0 else float('inf')" ) - if "latency_scale" in data: + if "latency_scale" in data and not "throughput_scale" in data: ls = data.pop("latency_scale") warnings.warn( f"Setting `latency_scale` on `{cls.__name__}` is deprecated; use " @@ -174,11 +169,6 @@ def _deprecate_latency_fields(cls, data): DeprecationWarning, stacklevel=2, ) - if "throughput_scale" in data: - raise ValueError( - f"Cannot specify both `latency_scale` and `throughput_scale` " - f"on `{cls.__name__}`. Drop the deprecated `latency_scale`." - ) ls = str(ls).strip() data["throughput_scale"] = ( f"1 / ({ls}) if ({ls}) != 0 else float('inf')" @@ -1304,8 +1294,9 @@ def _render_node_color(self) -> str: return "#E0EEFF" -class TopologySpec(str, enum.Enum): +class TopologySpec(enum.StrEnum): MESH = "mesh" + ALL_TO_ALL = "all_to_all" class Network(Component, Leaf): @@ -1316,6 +1307,20 @@ class Network(Component, Leaf): of the spatial nodes from top to bottom. """ + total_latency: str | int | float = "max(max_hops*actions['hops'].latency, max_link_traffic/actions['hops'].throughput)" + """ + Models latency as either: + - *Latency-bound*, which means that the latency of the route with the most number of + hops dominate the overall communication latency. + - *Bandwidth-bound*, which means that the traffic over the most congested link + dominates the overall communication latency. + + Keywords: + - `max_hops` returns the number of hops in the longest route. + - `max_link_traffic` returns the amount of traffic (in bits) over the most congested + link. + """ + bits_per_value: EvalsTo[dict] = {} """ Sets the bits per value for tensors in this `TensorHolder`. Keys are evaluated as diff --git a/accelforge/frontend/arch/spatialable.py b/accelforge/frontend/arch/spatialable.py index 44fa23f5..0b767302 100644 --- a/accelforge/frontend/arch/spatialable.py +++ b/accelforge/frontend/arch/spatialable.py @@ -93,6 +93,19 @@ def _eval_expressions(self, *args, **kwargs): return super(self.__class__, self)._eval_expressions(*args, **kwargs) +class PhysicalSpatial(EvalableModel): + name: str + """ + The name of the dimension over which this spatial fanout is occurring (e.g., X or Y). + """ + + fanout: EvalsTo[int] + """ The size of this fanout. """ + + stride: EvalsTo[int] + """ The number of array coordinates between each spatial fanout coordinate.""" + + class Spatialable(EvalableModel): """Something that can be duplicated to create an array of.""" @@ -107,7 +120,7 @@ class Spatialable(EvalableModel): specified at this level also apply to lower-level `Leaf` nodes in the architecture. """ - _physical_spatial: NoParse[Spatial] = EvalableList() + _physical_spatial: NoParse[PhysicalSpatial] = EvalableList() """ The physical spatial fanout of this node. Should only have a value for a flattened arch. Otherwise, the `spatial` attribute is authoritative. @@ -123,14 +136,29 @@ def get_fanout_along(self, dim_name: str, default: int = 1) -> int: return s.fanout return default + def _has_physical_dim(self, dim_name: str) -> bool: + for s in self._physical_spatial: + if s.name == dim_name: + return True + return False + def _get_physical_fanout_along(self, dim_name: str, default: int = 1) -> int: for s in self._physical_spatial: if s.name == dim_name: return s.fanout return default + def _get_physical_stride_along(self, dim_name: str) -> int: + for s in self._physical_spatial: + if s.name == dim_name: + return s.stride + raise ValueError(f"dimension {dim_name} not found") + def _spatial_str(self, include_newline=True) -> str: if not self.spatial: return "" result = ", ".join(f"{s.fanout}× {s.name}" for s in self.spatial) return f"\n[{result}]" if include_newline else result + + def _is_distributed(self): + return any(s.fanout > 1 for s in self._physical_spatial) \ No newline at end of file diff --git a/accelforge/frontend/arch/structure.py b/accelforge/frontend/arch/structure.py index 157a385a..8ceb7370 100644 --- a/accelforge/frontend/arch/structure.py +++ b/accelforge/frontend/arch/structure.py @@ -20,7 +20,7 @@ from accelforge.util.exceptions import EvaluationError -from accelforge.frontend.arch.spatialable import Spatialable +from accelforge.frontend.arch.spatialable import Spatialable, PhysicalSpatial from accelforge.frontend.arch._flattened_arch import FlattenedArch from pydantic import Discriminator @@ -334,6 +334,10 @@ def _flatten( nodes = [] + # Nodes inside an array are flattened to fit into a hierarchical + # model in order to map. + # However, we will keep information about how these nodes are + # arranged for modeling. for node in self.nodes: try: if isinstance(node, Branch): @@ -342,7 +346,14 @@ def _flatten( if isinstance(node, Spatialable): fanout *= node.get_fanout() node = deepcopy(node) - node._physical_spatial = node.spatial + node._physical_spatial = [ + PhysicalSpatial( + name=s.name, + fanout=s.fanout, + stride=self.get_fanout_along(s.name)/s.fanout + ) + for s in node.spatial + ] node.spatial = EvalableList() nodes.append(node) else: diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py index c96ec9b1..080e3c04 100755 --- a/accelforge/model/_looptree/latency/memory.py +++ b/accelforge/model/_looptree/latency/memory.py @@ -14,7 +14,7 @@ from accelforge.model._looptree.reuse.symbolic import BuffetStats from accelforge.util._eval_expressions import MATH_FUNCS, eval_expression -from accelforge.util._sympy.broadcast_max import Max, Min +from accelforge.util._sympy.broadcast_max import Max, Min, MaxGeqZero from accelforge.util._basetypes import EvalableList import symengine as se @@ -71,6 +71,10 @@ def component_latency( component_to_actions: dict[str, dict[str, float]] = defaultdict( lambda: defaultdict(lambda: 0) ) + # Holds ``keywords" that do not map neatly to actions, e.g., max_hops for network + component_to_keywords: dict[str, dict[str, float]] = defaultdict( + lambda: defaultdict(lambda: 0) + ) name2component: dict[str, Component] = {node.name: node for node in flattened_arch} compute_obj = flattened_arch[-1] @@ -103,6 +107,30 @@ def component_latency( f"Component {component} is not a TensorHolder or Compute" ) + network_to_max_link_traffic = defaultdict(lambda: defaultdict(lambda: 0)) + network_to_max_hops = defaultdict(lambda: []) + # Aggregates across tensors + for network, network_stats in looptree_results.network_stats.items(): + component = network.component + if component not in name2component: + raise ValueError(f"Component {component} found in mapping but not arch") + + dim_traffic = network_to_max_link_traffic[component] + for dim, max_traffic_in_dim in network_stats.max_traffic.items(): + dim_traffic[dim] += max_traffic_in_dim + + network_to_max_hops[component].append(network_stats.max_hops) + + for network, network_stats in looptree_results.network_stats.items(): + component = network.component + keywords = component_to_keywords[component] + keywords["max_link_traffic"] = MaxGeqZero( + *network_to_max_link_traffic[component].values() + ) + keywords["max_hops"] = MaxGeqZero( + *network_to_max_hops[component] + ) + longest_compute_latency = Max( 0, *[s.max_latency for s in looptree_results.compute_stats.values()] ) @@ -138,13 +166,18 @@ def component_latency( "sum": _sum, } - for component in component_to_actions: + for component in name2component: + if component not in component_to_actions and component not in component_to_keywords: + continue component_obj = name2component[component] dump = component_obj.shallow_model_dump(include_None=True) # Replace serialized `actions` dump with local Action copies that carry # the correct n_calls for this job, so formulas can access `a.n_calls`, # `a.throughput`, etc. without mutating the shared spec state. - dump["actions"] = component_to_actions[component] + if component in component_to_actions: + dump["actions"] = component_to_actions[component] + if component in component_to_keywords: + dump |= component_to_keywords[component] symbol_table = {**symbol_table_base, **dump} if component_obj.total_latency is not None: component_latency[component] = eval_expression( diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py index aec14fe6..0c833354 100644 --- a/accelforge/model/_looptree/reuse/symbolic/_network.py +++ b/accelforge/model/_looptree/reuse/symbolic/_network.py @@ -1,12 +1,11 @@ -import copy -from accelforge.frontend.arch import Network as NetworkSpec -from accelforge.frontend.mapping import ( - Spatial, -) +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any from accelforge.frontend.mapping import ( - Spatial, + Spatial ) +from accelforge.frontend.arch.components import TopologySpec from accelforge.frontend._workload_isl._symbolic import ( compute_dense_tile_occupancy, Irrelevant, @@ -14,94 +13,341 @@ PartiallyRelevant, ) -from accelforge.util._sympy.broadcast_max import Min, Max, MaxGeqZero +from accelforge.util._sympy.broadcast_max import MaxGeqZero, MinGeqZero from ._common import AnalysisInfo -from ._stats import NetworkStats +from ._stats import NetworkStats, SymbolicAnalysisOutput + + +@dataclass +class PerLoopTransferCost: + """The per-spatial-loop cost contributed by a single network, as computed + by a :class:`TopologyModel`.""" + + total_cost: Any + """Total hops contributed by data movement over this spatial loop.""" + max_hops: Any + """Hops added to the longest route by this spatial loop.""" + max_traffic: Any + """Maximum traffic (in actions) on any single link along this dimension.""" + + +class TopologyModel(ABC): + """Computes the cost of moving data across a network of a given topology. + + Subclasses encapsulate everything topology-specific about how a tensor's + data is delivered across a spatial fanout. :class:`NetworkAnalyzer` selects + the model for each network from its component's + :class:`~accelforge.frontend.arch.components.TopologySpec` and remains + agnostic to the topology itself. + + Instances are stateful: they accumulate per-network max hops across the + repeated spatial-loop iterations of a single :class:`NetworkAnalyzer`, so a + fresh model is constructed for each analyzer (see :func:`get_topology_model`). + """ + + def __init__(self): + # Running total of max hops per network, accumulated across the + # repeated spatial-loop iterations handled by one NetworkAnalyzer. + self.overall_max_hops: dict = {} + + def accumulate_max_hops(self, network, max_hops): + """Add this loop's ``max_hops`` to ``network``'s running total and + return the updated total. + + Each call to :meth:`NetworkAnalyzer.accumulate_child_result` (i.e., over + a different iteration of a spatial loop) adds more to the max hops. + """ + self.overall_max_hops[network] = ( + self.overall_max_hops.get(network, 0) + max_hops + ) + return self.overall_max_hops[network] + + @abstractmethod + def per_loop_transfer_cost( + self, + relevancy, + *, + shape_repeats, + last_fanout, + volume, + src_component, + dim_name: str, + ) -> PerLoopTransferCost: + """Return the :class:`PerLoopTransferCost` for moving ``volume`` of data across one + spatial loop. + + Args: + relevancy: The relevancy of the spatial loop's rank variable to the + tensor (``Irrelevant``, ``Relevant``, or ``PartiallyRelevant``). + shape_repeats: The number of iterations of this spatial loop. + last_fanout: The fanout in this dimension among mapping nodes below + (i.e., the stride). + volume: The data volume (in actions) moved per destination. + src_component: The flattened-arch component sourcing the data, used + to query physical fanout/stride. + dim_name: The name of the spatial dimension (e.g., ``X`` or ``Y``). + """ + raise NotImplementedError + + +class MeshTopologyModel(TopologyModel): + """Cost model for a mesh network. + + Data travels along one axis of the mesh. Multicast delivers a value to every + point along the dimension; unicast delivers a distinct value to each point. + When the source is physically distributed, data is bound as locally as + possible across the physical buffers. + """ + + def per_loop_transfer_cost( + self, + relevancy, + *, + shape_repeats, + last_fanout, + volume, + src_component, + dim_name, + ) -> PerLoopTransferCost: + if isinstance(relevancy, Irrelevant): + # The volume travels through link by link in one axis of the mesh + # Distributed or not, the amount of total cost is the same. + # However, the accesses now come from different physical memories + total_cost = multicast_cost(shape_repeats, last_fanout) * volume + max_hops = shape_repeats * last_fanout + max_traffic = volume + elif isinstance(relevancy, Relevant): + # If distributed, then we bind data as locally as possible in the + # physical buffers + if src_component._get_physical_fanout_along(dim_name) > 1: + physical_stride = src_component._get_physical_stride_along(dim_name) + + n_dsts_per_physical = MinGeqZero( + # if last_fanout > physical_stride, set n_dst to 1, which results in 0 hops + # later (which is correct because the set of destinations always overlap + # the set of sources). + MaxGeqZero(physical_stride / last_fanout, 1), + shape_repeats + ) + n_activated_physical = MaxGeqZero(shape_repeats * last_fanout / physical_stride, 1) + total_cost = ( + n_activated_physical + * + unicast_cost(n_dsts_per_physical, last_fanout) + * + volume + ) + max_hops = MinGeqZero((n_dsts_per_physical - 1) * last_fanout, physical_stride) + max_traffic = (n_dsts_per_physical - 1) * volume + else: + total_cost = unicast_cost(shape_repeats, last_fanout) * volume + max_hops = shape_repeats * last_fanout + max_traffic = (shape_repeats - 1) * volume + elif isinstance(relevancy, PartiallyRelevant): + raise NotImplementedError() + else: + raise RuntimeError(f"unhandled relevancy type {relevancy}") + + return PerLoopTransferCost(total_cost=total_cost, max_hops=max_hops, max_traffic=max_traffic) + + +class AllToAllTopologyModel(TopologyModel): + """Cost model for an all-to-all network using a switch (e.g. NVLink). + + Every node connects to every other node through a switch, so any + source reaches any destination in one hop regardless of + + Physical stride is irrelevant, so ``last_fanout`` and physical distribution + are not used. + """ + + HOPS_PER_TRANSFER = 1 + """Hops charged for one source-to-destination transfer across the switch. + One switch traversal is treated as a single hop; the per-hop energy and + latency come from the network component's ``hops`` action.""" + + def per_loop_transfer_cost( + self, + relevancy, + *, + shape_repeats, + last_fanout, + volume, + src_component, + dim_name, + ) -> PerLoopTransferCost: + hops = self.HOPS_PER_TRANSFER + + # n - 1 other instances each receive the data across the switch. The + # source already holds it (the set of destinations overlaps the set of + # sources), so it needs no transfer to itself. + n_dsts = shape_repeats - 1 + + if isinstance(relevancy, (Irrelevant, Relevant)): + # Same delivery count (and hence energy) whether the data is shared + # (multicast) or distinct per instance (unicast): each of the n - 1 + # destinations is one switch traversal away. + total_cost = n_dsts * hops * volume + # Every route is a single switch traversal, independent of distance. + max_hops = hops + if isinstance(relevancy, Irrelevant): + # Multicast: the switch replicates, so each link carries the + # value at most once. + max_traffic = volume + else: + # Unicast: the source's uplink to the switch carries all n - 1 + # distinct messages, making it the most congested link. + max_traffic = n_dsts * volume + elif isinstance(relevancy, PartiallyRelevant): + raise NotImplementedError() + else: + raise RuntimeError(f"unhandled relevancy type {relevancy}") + + return PerLoopTransferCost( + total_cost=total_cost, max_hops=max_hops, max_traffic=max_traffic + ) + + +# Registry of topology models +TOPOLOGY_MODELS: dict[TopologySpec, type[TopologyModel]] = { + TopologySpec.MESH: MeshTopologyModel, + TopologySpec.ALL_TO_ALL: AllToAllTopologyModel, +} + + +def get_topology_model(topology) -> TopologyModel: + """Construct a fresh :class:`TopologyModel` for the given topology.""" + return TOPOLOGY_MODELS[topology]() class NetworkAnalyzer: - def __init__(self, network_stats): - self.overall_max_hops = 0 + def __init__(self, network_stats, info: AnalysisInfo, einsum_name, node: Spatial): self.network_stats = network_stats + # These don't change across calls to accumulate_child_result. + self.info = info + self.einsum_name = einsum_name + self.node = node + # Each network gets its own topology model, since different networks may + # have different topologies. Models are constructed lazily, the first + # time a network needs costing, and reused for the analyzer's lifetime so + # their accumulated max hops persist. + self.topology_models: dict = {} + + def _get_topology_model(self, network, topology) -> TopologyModel: + if network not in self.topology_models: + self.topology_models[network] = get_topology_model(topology) + return self.topology_models[network] def accumulate_child_result( self, - child_result, - info: AnalysisInfo, + child_result: SymbolicAnalysisOutput, shape_repeats, - einsum_name, child_shape, - node, ): + """This function is called for every repeated shape.""" + flattened_arch = self.info.job.flattened_arch + for network, child_network_stats in child_result.network_stats.items(): + src_component = flattened_arch[network.source.level] if network not in self.network_stats: self.network_stats[network] = NetworkStats() accumulated_network_stats = self.network_stats[network] + # We only need to update the summary if the spatial loop is for + # a component higher than the network of interest + if flattened_arch.is_above(self.node.component, network.component): + accumulated_network_stats.total_hops += ( + child_network_stats.total_hops * shape_repeats + ) + accumulated_network_stats.max_hops = MaxGeqZero( + accumulated_network_stats.max_hops, + child_network_stats.max_hops, + ) + for k, v in child_network_stats.max_traffic.items(): + accumulated_network_stats.max_traffic[k] = MaxGeqZero( + accumulated_network_stats.max_traffic.get(k, 0), + v + ) + continue + + volume = self._get_data_volume(network, child_shape) + + relevancy = self.info.tensor_to_relevancy[network.tensor][self.node.rank_variable] + + # The fanout in this dimension in mapping nodes below, i.e., the stride + last_fanout = child_result.fanout.get((self.node.component, self.einsum_name), {}) + last_fanout = last_fanout.get(self.node.name, 1) + + topology_model = self._get_topology_model( + network, flattened_arch[network.component].topology + ) + per_loop_transfer_cost = topology_model.per_loop_transfer_cost( + relevancy, + shape_repeats=shape_repeats, + last_fanout=last_fanout, + volume=volume, + src_component=src_component, + dim_name=self.node.name, + ) + + overall_max_hops = topology_model.accumulate_max_hops( + network, per_loop_transfer_cost.max_hops + ) + accumulated_network_stats.total_hops += ( - child_network_stats.total_hops * shape_repeats + per_loop_transfer_cost.total_cost + + child_network_stats.total_hops * shape_repeats ) accumulated_network_stats.max_hops = MaxGeqZero( accumulated_network_stats.max_hops, - child_network_stats.max_hops, + overall_max_hops + child_network_stats.max_hops, ) - projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)] - component_object = info.job.flattened_arch[network.component] - workload_bpv = info.job.einsum.tensor_accesses[ - network.tensor - ].bits_per_value - bits_per_value = component_object.bits_per_value.get( - network.tensor, workload_bpv - ) - bits_per_action = component_object.bits_per_action - if bits_per_action is not None: - actions_per_value = bits_per_value / bits_per_action - else: - actions_per_value = bits_per_value - volume = ( - compute_dense_tile_occupancy(projection, child_shape) - * actions_per_value + accumulated_network_stats.max_traffic[self.node.name] = MaxGeqZero( + accumulated_network_stats.max_traffic.get(self.node.name, 0), + per_loop_transfer_cost.max_traffic + child_network_stats.max_traffic.get(self.node.name, 0) ) - if info.job.spec_one_einsum.arch.is_above( - node.component, network.component - ): - continue + overall_max_hops = {} + for model in self.topology_models.values(): + overall_max_hops.update(model.overall_max_hops) + return overall_max_hops - relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable] + def _get_data_volume(self, network, child_shape): + info = self.info + einsum_name = self.einsum_name + flattened_arch = info.job.flattened_arch + projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)] + component_object = flattened_arch[network.component] + workload_bpv = info.job.einsum.tensor_accesses[ + network.tensor + ].bits_per_value + bits_per_value = component_object.bits_per_value.get( + network.tensor, workload_bpv + ) + bits_per_action = component_object.bits_per_action + if bits_per_action is not None: + actions_per_value = bits_per_value / bits_per_action + else: + actions_per_value = bits_per_value + volume = ( + compute_dense_tile_occupancy(projection, child_shape) + * actions_per_value + ) + return volume - last_fanout = child_result.fanout.get((node.component, einsum_name), {}) - last_fanout = last_fanout.get(node.name, 1) - if isinstance(relevancy, Irrelevant): - # Cost of multicasting is the cost of delivering along the dimension - multicast_hops = shape_repeats * last_fanout - multicast_cost = multicast_hops * volume - self.overall_max_hops += multicast_hops - accumulated_network_stats.total_hops += multicast_cost - accumulated_network_stats.max_hops = MaxGeqZero( - accumulated_network_stats.max_hops, - self.overall_max_hops + child_network_stats.max_hops, - ) - elif isinstance(relevancy, Relevant): - # Cost of unicast is the cost of delivering to each point in - # the dimension with shape as stride - # TODO: we should use the actual stride - total_unicast_cost = ( - 0.5 * (shape_repeats + 1) * shape_repeats * last_fanout * volume - ) - max_unicast_hops = shape_repeats * last_fanout - self.overall_max_hops += max_unicast_hops +def multicast_cost(n_dsts, stride): + """Returns total hops of multicast along a dimension.""" + return (n_dsts-1)*stride + + +def unicast_cost(n_dsts, stride): + """Returns total hops of unicast along a dimension.""" + # Cost of unicast is the cost of delivering to each point in + # the dimension with shape as stride + return arithmetic_sum(n_dsts-1)*stride - accumulated_network_stats.total_hops += total_unicast_cost - accumulated_network_stats.max_hops = MaxGeqZero( - accumulated_network_stats.max_hops, - self.overall_max_hops + child_network_stats.max_hops, - ) - elif isinstance(relevancy, PartiallyRelevant): - raise NotImplementedError() - else: - raise RuntimeError(f"unhandled relevancy type {relevancy}") - return self.overall_max_hops +def arithmetic_sum(n): + return 0.5 * (n+1) * n diff --git a/accelforge/model/_looptree/reuse/symbolic/_stats.py b/accelforge/model/_looptree/reuse/symbolic/_stats.py index 8368937d..aa2c1d90 100644 --- a/accelforge/model/_looptree/reuse/symbolic/_stats.py +++ b/accelforge/model/_looptree/reuse/symbolic/_stats.py @@ -21,7 +21,11 @@ @dataclass class NetworkStats: total_hops: Any = field(default=0) + """Total number of hops overall. Useful to calculate energy.""" max_hops: Any = field(default=0) + """Longest hops among all routes.""" + max_traffic: dict[int | str, Any] = field(default_factory=dict) + """Maximum traffic occuring on any single link along a dimension.""" def repeat(self, n_repeats): new = copy.copy(self) @@ -32,10 +36,6 @@ def repeat(self, n_repeats): new.total_hops = new.total_hops * n_repeats return new - def combine(self, other: "NetworkStats"): - self.total_hops += other.total_hops - self.max_hops = max(self.max_hops, other.max_hops) - @dataclass class BuffetStats: @@ -100,6 +100,12 @@ def repeat_temporal(self, factor: int, is_fully_relevant: bool) -> "BuffetStats" return new def repeat_spatial(self, factor: int, reuse_parent_accesses: bool) -> "BuffetStats": + """ + Repeat buffet stats due to spatial loop `factor` number of times. + + For accesses to parent, the amount of repetition is `factor` if `reuse_parent_access` + is False; otherwise, there is no repetition. + """ new = copy.copy(self) if factor == 1: return new diff --git a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py index abc9238f..e8d628e7 100755 --- a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py +++ b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py @@ -585,13 +585,16 @@ def analyze_spatial(node_idx, current_shape, info: AnalysisInfo): node: Spatial = mapping[node_idx] rank_var = node.rank_variable node_dim = node.name - spatial_component = info.job.flattened_arch[node.component] + flattened_arch = info.job.flattened_arch + spatial_component = flattened_arch[node.component] component_spatial_dim = spatial_component.spatial[node_dim] stride_and_shape = loop_stride_and_shape(node, current_shape, node_idx, info) result_accumulator = SymbolicAnalysisOutput() - network_analyzer = NetworkAnalyzer(result_accumulator.network_stats) + network_analyzer = NetworkAnalyzer( + result_accumulator.network_stats, info, einsum_name, node + ) def handle_repeated_value(repeated_shape): shape_value = repeated_shape.value @@ -605,7 +608,6 @@ def handle_repeated_value(repeated_shape): accumulated_buffet_stats = result_accumulator.buffet_stats child_stats = list(child_result.buffet_stats.items()) for i, (buffet, buffet_stats) in enumerate(child_stats): - stats = buffet_stats accumulated_stats = accumulated_buffet_stats.setdefault( buffet, BuffetStats.blank() ) @@ -627,13 +629,13 @@ def handle_repeated_value(repeated_shape): and buffet.tensor in component_spatial_dim.may_reuse ) - stats.n_loops_above = stats.n_loops_above + 1 - accumulated_stats += stats.repeat_spatial( + buffet_stats.n_loops_above = buffet_stats.n_loops_above + 1 + accumulated_stats += buffet_stats.repeat_spatial( shape_repeats, reuse_parent_accesses ) network_analyzer.accumulate_child_result( - child_result, info, shape_repeats, einsum_name, child_shape, node + child_result, shape_repeats, child_shape ) for einsum, child_steps in child_result.temporal_steps.items(): @@ -691,6 +693,7 @@ def analyze_storage( count_writes: bool = True, ): mapping = info.mapping + flattened_arch = info.job.flattened_arch einsum_name = mapping[-1].einsum node: TensorHolder = mapping[node_idx] @@ -797,25 +800,49 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any: else: write_scale = 0 + # ======================= + # For distributed buffers + n_active_physical_units = 1 + if child is not None: + next_spatial = flattened_arch.first_below( + node.component, + lambda n: isinstance(n, arch.Spatialable) and len(n.spatial) > 0, + default=None, + ) + if component_object._is_distributed() and next_spatial is not None: + for (b, e), dim_fanout in child_result.fanout.items(): + if b != next_spatial.name: + continue + for d in dim_fanout: + if not component_object._has_physical_dim(d): + continue + n_active_physical_units *= ( + dim_fanout[d] / component_object._get_physical_stride_along(d) + ) + + # ========================== + # Recalculate usage of distributed buffers + stats.max_occupancy /= n_active_physical_units + # ========================== # Data exchanges with parent if count_downward_movement[tensor]: # Parent -> Me stats.total_write_actions += stats.total_reads_to_parent * write_scale stats.max_per_unit_write_actions += ( - stats.total_reads_to_parent * write_scale + stats.total_reads_to_parent * write_scale / n_active_physical_units ) stats.total_skipped_first_write_actions += ( stats.total_skipped_first_reads_to_parent * write_scale ) stats.min_per_unit_skipped_first_write_actions += ( - stats.min_per_parent_skipped_first_reads_to_parent * write_scale + stats.min_per_parent_skipped_first_reads_to_parent * write_scale / n_active_physical_units ) if count_upward_movement[tensor]: # Me -> Parent # Comment this to have the final writeback to a buffer hit both that buffer and # go directly to the parent without incurring another read from the buffer. stats.total_read_actions += stats.total_writes_to_parent * read_scale - stats.max_per_unit_read_actions += stats.total_writes_to_parent * read_scale + stats.max_per_unit_read_actions += stats.total_writes_to_parent * read_scale / n_active_physical_units # ======================== # Data exchanges with peer @@ -828,7 +855,7 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any: if count_downward_movement[tensor]: # Me -> Child stats.total_read_actions += child.total_reads_to_parent * read_scale stats.max_per_unit_read_actions += ( - child.max_per_parent_reads_to_parent * read_scale + child.max_per_parent_reads_to_parent * read_scale / n_active_physical_units ) # Skip first read if skip_initial: @@ -836,13 +863,13 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any: child.total_skipped_first_reads_to_parent * read_scale ) stats.min_per_unit_skipped_first_read_actions += ( - child.min_per_parent_skipped_first_reads_to_parent * read_scale + child.min_per_parent_skipped_first_reads_to_parent * read_scale / n_active_physical_units ) if count_upward_movement[tensor]: # Child -> Me stats.total_write_actions += child.total_writes_to_parent * write_scale stats.max_per_unit_write_actions += ( - child.max_per_parent_writes_to_parent * write_scale + child.max_per_parent_writes_to_parent * write_scale / n_active_physical_units ) return child_result @@ -902,19 +929,21 @@ def analyze_reservation(node_idx, current_shape, info: AnalysisInfo): child_result.buffet_stats[buffet] = stats # Reservation nodes are the first to produce stats for a network - network_node = info.job.spec_one_einsum.arch.find_first_of_type_above( - NetworkSpec, buffet.level, default=None - ) - if network_node is not None: - network = Network( - tensor, - einsum_name, - info.data_movement_connections.get_src(buffet), - buffet, - component=network_node.name if network_node else network_node, + src = info.data_movement_connections.get_src(buffet) + if src is not None: + network_node = info.job.flattened_arch.find_first_of_type_between( + NetworkSpec, buffet.level, src.level, default=None ) - assert network not in child_result.network_stats - child_result.network_stats[network] = NetworkStats() + if network_node is not None: + network = Network( + tensor, + einsum_name, + src, + buffet, + component=network_node.name if network_node else network_node, + ) + assert network not in child_result.network_stats + child_result.network_stats[network] = NetworkStats() fanout_key = (node.resource, einsum_name) if fanout_key not in child_result.fanout: @@ -964,18 +993,20 @@ def analyze_compute( stats.max_occupancy = 1 result_accumulator.buffet_stats[buffet] = stats - network_node = info.job.spec_one_einsum.arch.find_first_of_type_above( - NetworkSpec, node.component, default=None - ) - if network_node is not None: - network = Network( - tensor, - info.job.einsum_name, - info.data_movement_connections.get_src(buffet), - buffet, - component=network_node.name if network_node else network_node, + src = info.data_movement_connections.get_src(buffet) + if src is not None: + network_node = info.job.flattened_arch.find_first_of_type_between( + NetworkSpec, node.component, src.level, default=None ) - result_accumulator.network_stats[network] = NetworkStats() + if network_node is not None: + network = Network( + tensor, + info.job.einsum_name, + src, + buffet, + component=network_node.name if network_node else network_node, + ) + result_accumulator.network_stats[network] = NetworkStats() return result_accumulator diff --git a/tests/input_files/networked/flat.yaml b/tests/network/input_files/networked/flat.yaml similarity index 61% rename from tests/input_files/networked/flat.yaml rename to tests/network/input_files/networked/flat.yaml index 2c1b2cb0..28679d21 100644 --- a/tests/input_files/networked/flat.yaml +++ b/tests/network/input_files/networked/flat.yaml @@ -10,12 +10,6 @@ arch: - {name: read, energy: 0, throughput: inf} - {name: write, energy: 0, throughput: inf} - - !Network - name: NoC - area: 0 - leak_power: 0 - actions: [] - - !Array name: Array spatial: @@ -37,10 +31,10 @@ arch: size: inf area: 0 leak_power: 0 - tensors: {keep: ~MainMemory, may_keep: All} + tensors: {keep: input, may_keep: input} actions: - - {name: read, energy: 0, throughput: inf} - - {name: write, energy: 0, throughput: inf} + - {name: read, energy: 5, throughput: 1} + - {name: write, energy: 5, throughput: inf} spatial: - {name: X, fanout: 4} @@ -49,10 +43,10 @@ arch: size: inf area: 0 leak_power: 0 - tensors: {keep: ~MainMemory, may_keep: All} + tensors: {keep: output, may_keep: output} actions: - - {name: read, energy: 0, throughput: inf} - - {name: write, energy: 0, throughput: inf} + - {name: read, energy: 5, throughput: inf} + - {name: write, energy: 5, throughput: inf} spatial: - {name: Y, fanout: 4} @@ -61,26 +55,34 @@ arch: size: inf area: 0 leak_power: 0 - tensors: {keep: ~MainMemory, may_keep: All} + tensors: {keep: weight, may_keep: weight} actions: - - {name: read, energy: 0, throughput: inf} - - {name: write, energy: 0, throughput: inf} + - {name: read, energy: 5, throughput: 1} + - {name: write, energy: 5, throughput: 1} spatial: - {name: X, fanout: 2} - {name: Y, fanout: 2} + - !Network + name: NoC + area: 0 + leak_power: 0 + actions: + - {name: hops, energy: 1, latency: 0, throughput: inf} + - !Memory name: Scratchpad size: inf area: 0 leak_power: 0 + tensors: {keep: weight, may_keep: weight} actions: - - {name: read, energy: 0, throughput: inf} - - {name: write, energy: 0, throughput: inf} + - {name: read, energy: 1, throughput: inf} + - {name: write, energy: 1, throughput: inf} - !Compute name: MAC area: 0 leak_power: 0 actions: - - {name: compute, energy: 0, throughput: inf} \ No newline at end of file + - {name: compute, energy: 1, throughput: inf} diff --git a/tests/network/input_files/networked/hierarchical.yaml b/tests/network/input_files/networked/hierarchical.yaml new file mode 100644 index 00000000..f268ef7e --- /dev/null +++ b/tests/network/input_files/networked/hierarchical.yaml @@ -0,0 +1,58 @@ +arch: + nodes: + - !Memory + name: MainMemory + size: inf + area: 0 + leak_power: 0 + tensors: {keep: All} + actions: + - {name: read, energy: 0, throughput: 1e9} + - {name: write, energy: 0, throughput: 1e9} + + - !Memory + name: GlobalBuffer + size: inf + area: 0 + leak_power: 0 + tensors: {keep: ~MainMemory, may_keep: All} + actions: + - {name: read, energy: 0, throughput: 4e9} + - {name: write, energy: 0, throughput: 4e9} + + - !Network + name: PeArray + area: 0 + leak_power: 0 + actions: + - {name: hops, energy: 1, latency: 0, throughput: 4e9} + + - !Memory + name: Scratchpad + size: inf + area: 0 + leak_power: 0 + tensors: {keep: All} + actions: + - {name: read, energy: 0, throughput: 16e9} + - {name: write, energy: 0, throughput: 16e9} + spatial: + - {name: X, fanout: 2} + - {name: Y, fanout: 2} + + - !Network + name: MacArray + area: 0 + leak_power: 0 + actions: + - {name: hops, energy: 1, latency: 0, throughput: 16e9} + + - !Compute + name: MAC + area: 0 + leak_power: 0 + actions: + - {name: compute, energy: 0, throughput: 1e9} + spatial: + - {name: X, fanout: 2} + - {name: Y, fanout: 2} \ No newline at end of file diff --git a/tests/input_files/networked/hierarchical_1d.yaml b/tests/network/input_files/networked/hierarchical_1d.yaml similarity index 87% rename from tests/input_files/networked/hierarchical_1d.yaml rename to tests/network/input_files/networked/hierarchical_1d.yaml index 15af7af1..167212ff 100644 --- a/tests/input_files/networked/hierarchical_1d.yaml +++ b/tests/network/input_files/networked/hierarchical_1d.yaml @@ -24,8 +24,9 @@ arch: name: PeArray area: 0 leak_power: 0 + total_latency: "max_hops" actions: - - {name: hops, energy: 1, throughput: inf} + - {name: hops, energy: 1, latency: 0, throughput: 1} - !Memory name: Scratchpad @@ -44,7 +45,7 @@ arch: area: 0 leak_power: 0 actions: - - {name: hops, energy: 1, throughput: inf} + - {name: hops, energy: 1, latency: 1, throughput: inf} - !Compute name: MAC diff --git a/tests/input_files/networked/hierarchical.yaml b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml similarity index 76% rename from tests/input_files/networked/hierarchical.yaml rename to tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml index 61d7cf70..bbb14f8c 100644 --- a/tests/input_files/networked/hierarchical.yaml +++ b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml @@ -24,8 +24,9 @@ arch: name: PeArray area: 0 leak_power: 0 + total_latency: "max_hops" actions: - - {name: hops, energy: 1, throughput: inf} + - {name: hops, energy: 1, latency: 1, throughput: inf} - !Memory name: Scratchpad @@ -37,15 +38,16 @@ arch: - {name: read, energy: 0, throughput: inf} - {name: write, energy: 0, throughput: inf} spatial: - - {name: X, fanout: 2} - - {name: Y, fanout: 2} + - {name: X, fanout: 4} + # All-to-all switch (NVLink-like): every node is one hop from every other - !Network name: MacArray + topology: all_to_all area: 0 leak_power: 0 actions: - - {name: hops, energy: 1, throughput: inf} + - {name: hops, energy: 1, latency: 1, throughput: inf} - !Compute name: MAC @@ -54,5 +56,4 @@ arch: actions: - {name: compute, energy: 0, throughput: inf} spatial: - - {name: X, fanout: 2} - - {name: Y, fanout: 2} \ No newline at end of file + - {name: X, fanout: 4} diff --git a/tests/network/input_files/networked/hierarchical_switched.yaml b/tests/network/input_files/networked/hierarchical_switched.yaml new file mode 100644 index 00000000..0bfd6592 --- /dev/null +++ b/tests/network/input_files/networked/hierarchical_switched.yaml @@ -0,0 +1,58 @@ +arch: + nodes: + - !Memory + name: MainMemory + size: inf + area: 0 + leak_power: 0 + tensors: {keep: All} + actions: + - {name: read, energy: 100, latency: 1e-9} + - {name: write, energy: 100, latency: 1e-9} + + - !Memory + name: GlobalBuffer + size: inf + area: 0 + leak_power: 0 + tensors: {keep: ~MainMemory, may_keep: All} + actions: + - {name: read, energy: 10, latency: 1e-9/4} + - {name: write, energy: 10, latency: 1e-9/4} + + - !Network + name: PeArray + area: 0 + leak_power: 0 + actions: + - {name: hops, energy: 5, latency: 1e-9/4} + + - !Memory + name: Scratchpad + size: inf + area: 0 + leak_power: 0 + tensors: {keep: All} + actions: + - {name: read, energy: 2, latency: 1e-9/16} + - {name: write, energy: 2, latency: 1e-9/16} + spatial: + - {name: X, fanout: 2} + - {name: Y, fanout: 2} + + - !Network + name: MacArray + area: 0 + leak_power: 0 + actions: + - {name: hops, energy: 1, latency: 1e-9/16} + + - !Compute + name: MAC + area: 0 + leak_power: 0 + actions: + - {name: compute, energy: 1, latency: 1e-9} + spatial: + - {name: X, fanout: 2} + - {name: Y, fanout: 2} \ No newline at end of file diff --git a/tests/network/input_files/networked/one_matmul_to_flat.yaml b/tests/network/input_files/networked/one_matmul_to_flat.yaml new file mode 100644 index 00000000..cf7d2f17 --- /dev/null +++ b/tests/network/input_files/networked/one_matmul_to_flat.yaml @@ -0,0 +1,42 @@ +mapping: + nodes: + - !Storage + component: MainMemory + tensors: [T0, T1, W0] + - !Storage + component: DistributedBuffer + tensors: [W0] + - !Temporal + rank_variable: m + tile_shape: {{ M_TILE }} + - !Storage + component: RowBuffer + tensors: [T0] + - !Storage + component: ColumnBuffer + tensors: [T1] + - !Spatial + rank_variable: n0 + tile_shape: {{ MAC_TILE }} + component: Array + name: X + - !Spatial + rank_variable: n1 + tile_shape: {{ MAC_TILE }} + component: Array + name: Y + - !Storage + component: Scratchpad + tensors: [T0, T1, W0] + - !Temporal + rank_variable: m + tile_shape: 1 + - !Temporal + rank_variable: n0 + tile_shape: 1 + - !Temporal + rank_variable: n1 + tile_shape: 1 + - !Compute + einsum: Matmul0 + component: MAC \ No newline at end of file diff --git a/tests/input_files/networked/one_matmul_to_networked_hierarchical.yaml b/tests/network/input_files/networked/one_matmul_to_networked_hierarchical.yaml similarity index 100% rename from tests/input_files/networked/one_matmul_to_networked_hierarchical.yaml rename to tests/network/input_files/networked/one_matmul_to_networked_hierarchical.yaml diff --git a/tests/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml b/tests/network/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml similarity index 100% rename from tests/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml rename to tests/network/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml diff --git a/tests/network/test_network.py b/tests/network/test_network.py new file mode 100644 index 00000000..04e6e6ba --- /dev/null +++ b/tests/network/test_network.py @@ -0,0 +1,451 @@ +from pathlib import Path +from unittest import TestCase + +import accelforge as af + +INPUT_FILES_DIR = Path(__file__).parent / "input_files" / "networked" + + +class TestParsing(TestCase): + def test_hierarchical(self): + spec = af.Spec.from_yaml( + INPUT_FILES_DIR / "hierarchical.yaml", + ) + self.assertIn("PeArray", spec.arch.nodes) + self.assertEqual(spec.arch.nodes["PeArray"].get_fanout(), 1) + self.assertIn("Scratchpad", spec.arch.nodes) + self.assertEqual(spec.arch.nodes["Scratchpad"].get_fanout(), 4) + self.assertIn("MacArray", spec.arch.nodes) + self.assertEqual(spec.arch.nodes["MacArray"].get_fanout(), 1) + + try: + spec = spec.calculate_component_costs() + except af.EvaluationError as e: + self.fail(e.message) + + def test_flat(self): + spec = af.Spec.from_yaml( + INPUT_FILES_DIR / "flat.yaml", + ) + + try: + spec = spec.calculate_component_costs() + except af.EvaluationError as e: + self.fail(e.message) + + +class TestModelMesh(TestCase): + def test_hierarchical_1d(self): + M = 8 + KN = 8 + MAC_TILE = 2 + M_TILE = 4 + BITS_PER_VALUE = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + # af.examples.arches.networked.hierarchical, + INPUT_FILES_DIR / "hierarchical_1d.yaml", + # af.examples.mappings.one_matmul_to_networked_hierarchical, + INPUT_FILES_DIR / "one_matmul_to_networked_hierarchical_1d.yaml", + jinja_parse_data={ + "N_EINSUMS": 1, + "M": 8, + "KN": 8, + "MAC_TILE": MAC_TILE, + "M_TILE": M_TILE, + }, + ) + result = spec.evaluate_mapping() + self.assertEqual( + result.data["Matmul0actionMacArrayT0hops"].iloc[0], + (M / M_TILE) + * (KN / MAC_TILE) # number of used Scratchpad + * M_TILE + * KN # temporal for n1 in mapping + * sum(i for i in range(MAC_TILE)) # unicast along X-axis of MacArray + * BITS_PER_VALUE, + ) + # NOTE: assuming XY routing (as defined in mapping) + self.assertEqual( + result.data["Matmul0actionMacArrayT1hops"].iloc[0], + (M / M_TILE) + * (KN / MAC_TILE) + * M_TILE + * KN # temporal for n1 in mapping + * (MAC_TILE - 1) # multicast along X-axis of MacArray + * BITS_PER_VALUE, + ) + self.assertEqual( + result.data["Matmul0actionMacArrayW0hops"].iloc[0], + (M / M_TILE) + * (KN / MAC_TILE) + * M_TILE + * KN + * sum(i for i in range(MAC_TILE)) + * BITS_PER_VALUE, + ) + + self.assertEqual( + result.data["Matmul0actionPeArrayT0hops"].iloc[0], + (M / M_TILE) + * sum(i for i in range(KN // MAC_TILE)) # unicast along X-axis of PeArray + * M_TILE + * MAC_TILE + * BITS_PER_VALUE, + ) + # NOTE: assuming XY routing (as defined in mapping) + self.assertEqual( + result.data["Matmul0actionPeArrayT1hops"].iloc[0], + (M / M_TILE) + * (KN // MAC_TILE - 1) # multicast along X-axis of PeArray + * M_TILE + * KN + * BITS_PER_VALUE, + ) + self.assertEqual( + result.data["Matmul0actionPeArrayW0hops"].iloc[0], + (M / M_TILE) + * sum(i for i in range(KN // MAC_TILE)) # unicast along PeArray + * MAC_TILE + * KN + * BITS_PER_VALUE, + ) + self.assertEqual( + result.data["Totallatency"].iloc[0], + 4 + ) + + def test_hierarchical(self): + M = 8 + KN = 8 + MAC_TILE = 2 + PE_TILE = KN // MAC_TILE + M_TILE = 4 + BITS_PER_VALUE = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + # af.examples.arches.networked.hierarchical, + INPUT_FILES_DIR / "hierarchical.yaml", + # af.examples.mappings.one_matmul_to_networked_hierarchical, + INPUT_FILES_DIR / "one_matmul_to_networked_hierarchical.yaml", + jinja_parse_data={ + "N_EINSUMS": 1, + "M": 8, + "KN": 8, + "MAC_TILE": MAC_TILE, + "M_TILE": M_TILE, + }, + ) + result = spec.evaluate_mapping() + self.assertEqual( + result.data["Matmul0actionMacArrayT0hops"].iloc[0], + (M / M_TILE) + * (KN / MAC_TILE) ** 2 + * M_TILE + * ( + sum(i for i in range(MAC_TILE)) # unicasting along X + + + MAC_TILE * (MAC_TILE-1) # multicast along Y for each column + ) + * BITS_PER_VALUE, + ) + # NOTE: assuming XY routing (as defined in mapping) + self.assertEqual( + result.data["Matmul0actionMacArrayT1hops"].iloc[0], + (M / M_TILE) + * (KN / MAC_TILE) ** 2 + * M_TILE + * ( + MAC_TILE * (MAC_TILE - 1) # multicast along X (the tile is shape N1, which is MAC_TILE here) + + + MAC_TILE * sum(i for i in range(MAC_TILE)) # unicasting along Y for each row + ) + * BITS_PER_VALUE, + ) + self.assertEqual( + result.data["Matmul0actionMacArrayW0hops"].iloc[0], + (M / M_TILE) + * (KN / MAC_TILE) ** 2 + * M_TILE + * ( + MAC_TILE * sum(i for i in range(MAC_TILE)) # unicast along X (the tile is shape N1, which is MAC_TILE here) + + + MAC_TILE * sum(i for i in range(MAC_TILE)) # unicasting along Y for each row + ) + * BITS_PER_VALUE, + ) + + self.assertEqual( + result.data["Matmul0actionPeArrayT0hops"].iloc[0], + (M / M_TILE) + * ( + sum(i for i in range(PE_TILE)) + + + PE_TILE * (PE_TILE - 1) + ) + # tile shape + * M_TILE * MAC_TILE * BITS_PER_VALUE, + ) + # NOTE: assuming XY routing (as defined in mapping) + self.assertEqual( + result.data["Matmul0actionPeArrayT1hops"].iloc[0], + (M / M_TILE) + * ( + PE_TILE * (PE_TILE - 1) + + + PE_TILE * sum(i for i in range(PE_TILE)) + ) + * M_TILE + * MAC_TILE + * BITS_PER_VALUE, + ) + self.assertEqual( + result.data["Matmul0actionPeArrayW0hops"].iloc[0], + (M / M_TILE) + * ( + PE_TILE * sum(i for i in range(PE_TILE)) + + + PE_TILE * sum(i for i in range(PE_TILE)) + ) + * MAC_TILE**2 + * BITS_PER_VALUE, + ) + + def test_flat(self): + M = 8 + KN = 8 + MAC_TILE = 2 + M_TILE = 4 + BITS_PER_VALUE = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + INPUT_FILES_DIR / "flat.yaml", + INPUT_FILES_DIR / "one_matmul_to_flat.yaml", + jinja_parse_data={ + "N_EINSUMS": 1, + "M": 8, + "KN": 8, + "MAC_TILE": MAC_TILE, + "M_TILE": M_TILE, + }, + ) + result = spec.evaluate_mapping() + self.assertEqual( + result.data['Matmul0actionNoCT0hops'].iloc[0], + ( + M / M_TILE + * + (KN / MAC_TILE) * (KN / MAC_TILE - 1) # num rows * multicast_hops + * + M_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + ) + ) + self.assertEqual( + result.data['Matmul0actionNoCT1hops'].iloc[0], + ( + M / M_TILE + * + (KN / MAC_TILE) * (KN / MAC_TILE - 1) # num rows * multicast_hops + * + M_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + ) + ) + self.assertEqual( + result.data['Matmul0actionNoCW0hops'].iloc[0], + ( + M / M_TILE + * + ( + 4 # a 2x2 grid of physical buffers + * + ( + sum(i for i in range(2)) * MAC_TILE # unicast along row * tile shape + + + 2 * sum(i for i in range(2)) # num cols * unicast down col + ) + ) + * + MAC_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + ) + ) + self.assertEqual( + result.data['Matmul0actionRowBufferT0read'].iloc[0], + ( + M / M_TILE + * + KN // MAC_TILE + * + M_TILE * MAC_TILE + * + BITS_PER_VALUE + ) + ) + self.assertEqual( + result.data['Matmul0latencyRowBuffer'].iloc[0], + ( + M / M_TILE + * + KN // MAC_TILE + * + M_TILE * MAC_TILE + * + BITS_PER_VALUE + / + 4 # num of physical RowBuffer + ) + ) + self.assertEqual( + result.data['Matmul0latencyDistributedBuffer'].iloc[0], + ( # Reads from child + M / M_TILE + * + KN // MAC_TILE + * + KN // MAC_TILE + * + MAC_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + / + 4 # num of physical DistributedBuffer + ) + + + ( # Writes from parent + KN // MAC_TILE + * + KN // MAC_TILE + * + MAC_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + / + 4 # num of physical DistributedBuffer + ) + ) + + +class TestModelAllToAll(TestCase): + """MacArray is an all-to-all switch (NVLink-like). PeArray is a mesh.""" + + def test_hierarchical_1d_all_to_all(self): + M = 8 + KN = 8 + MAC_TILE = 4 + M_TILE = 4 + BITS_PER_VALUE = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + INPUT_FILES_DIR / "hierarchical_1d_all_to_all.yaml", + INPUT_FILES_DIR / "one_matmul_to_networked_hierarchical_1d.yaml", + jinja_parse_data={ + "N_EINSUMS": 1, + "M": M, + "KN": KN, + "MAC_TILE": MAC_TILE, + "M_TILE": M_TILE, + }, + ) + result = spec.evaluate_mapping() + + # --- MacArray: all-to-all switch --------------------------------- + # Every node is one hop away + all_to_all = ( + (M / M_TILE) + * (KN / MAC_TILE) # number of used Scratchpad + * M_TILE + * KN # temporal for n1 in mapping + * (MAC_TILE - 1) # one hop per destination, for every tensor + * BITS_PER_VALUE + ) + for tensor in ("T0", "T1", "W0"): + self.assertEqual( + result.data[ + f"Matmul0actionMacArray{tensor}hops" + ].iloc[0], + all_to_all, + msg=f"unexpected MacArray hops for {tensor}", + ) + + # --- PeArray: still a mesh --------------------------------------- + # Unchanged from test_hierarchical_1d, so the mesh formulas hold (now + # with MAC_TILE = 4, i.e. KN // MAC_TILE = 2). + self.assertEqual( + result.data["Matmul0actionPeArrayT0hops"].iloc[0], + (M / M_TILE) + * sum(i for i in range(KN // MAC_TILE)) # unicast along X of PeArray + * M_TILE + * MAC_TILE + * BITS_PER_VALUE, + ) + self.assertEqual( + result.data["Matmul0actionPeArrayT1hops"].iloc[0], + (M / M_TILE) + * (KN // MAC_TILE - 1) # multicast along X of PeArray + * M_TILE + * KN + * BITS_PER_VALUE, + ) + self.assertEqual( + result.data["Matmul0actionPeArrayW0hops"].iloc[0], + (M / M_TILE) + * sum(i for i in range(KN // MAC_TILE)) # unicast along PeArray + * MAC_TILE + * KN + * BITS_PER_VALUE, + ) + + # --- Latency ------------------------------------------------------ + # The switch's uniform single-hop routing gives MacArray a constant + # latency of 1, versus the mesh PeArray's 2. + self.assertEqual( + result.data["Matmul0latencyMacArray"].iloc[0], 1 + ) + self.assertEqual( + result.data["Matmul0latencyPeArray"].iloc[0], 2 + ) + self.assertEqual(result.data["Totallatency"].iloc[0], 2) + + +class TestMapper(TestCase): + def test_hierarchical(self): + M = 8 + KN = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + INPUT_FILES_DIR / "hierarchical.yaml", + jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN} + ) + result = spec.map_workload_to_arch() + + def test_flat(self): + M = 8 + KN = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + INPUT_FILES_DIR / "flat.yaml", + jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN} + ) + result = spec.map_workload_to_arch() + + def test_flat_one_row_buffer(self): + M = 8 + KN = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + INPUT_FILES_DIR / "flat.yaml", + jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN, "N_ROW_BUFFER": 1} + ) + result = spec.map_workload_to_arch() diff --git a/tests/network/test_topology_model.py b/tests/network/test_topology_model.py new file mode 100644 index 00000000..36dcc31c --- /dev/null +++ b/tests/network/test_topology_model.py @@ -0,0 +1,168 @@ +from unittest import TestCase + +from accelforge.frontend.arch.components import TopologySpec +from accelforge.frontend._workload_isl._symbolic import ( + Irrelevant, + PartiallyRelevant, + Relevant, +) +from accelforge.model._looptree.reuse.symbolic._network import ( + AllToAllTopologyModel, + MeshTopologyModel, + get_topology_model, +) + + +class _NoDistribution: + """Stand-in source component that is not physically distributed.""" + + def _get_physical_fanout_along(self, dim_name, default=1): + return 1 + + +class _Distributed: + """Stand-in source component physically distributed along a dimension.""" + + def __init__(self, fanout, stride): + self.fanout = fanout + self.stride = stride + + def _get_physical_fanout_along(self, dim_name, default=1): + return self.fanout + + def _get_physical_stride_along(self, dim_name): + return self.stride + + +class TestMeshTopologyModel(TestCase): + """Unit tests for the mesh cost model in isolation.""" + + def _cost(self, relevancy, *, n, stride, volume=10, src=None): + return MeshTopologyModel().per_loop_transfer_cost( + relevancy, + shape_repeats=n, + last_fanout=stride, + volume=volume, + src_component=src if src is not None else _NoDistribution(), + dim_name="X", + ) + + def test_registry_resolves_model(self): + self.assertIsInstance(get_topology_model(TopologySpec.MESH), MeshTopologyModel) + self.assertIsInstance(get_topology_model("mesh"), MeshTopologyModel) + + def test_multicast(self): + # Irrelevant: one value flows down the line, dropped at each of the + # (n - 1) downstream nodes. Each link carries it at most once. + n, stride, volume = 4, 2, 10 + cost = self._cost(Irrelevant(), n=n, stride=stride, volume=volume) + self.assertEqual(cost.total_cost, (n - 1) * stride * volume) + self.assertEqual(cost.max_hops, n * stride) + self.assertEqual(cost.max_traffic, volume) + + def test_unicast(self): + # Relevant (not distributed): each destination needs its own data + # delivered i*stride hops away, so the total is quadratic and the link + # nearest the source carries traffic for all (n - 1) downstream nodes. + n, stride, volume = 4, 2, 10 + cost = self._cost(Relevant("n0"), n=n, stride=stride, volume=volume) + self.assertEqual(cost.total_cost, sum(range(n)) * stride * volume) + self.assertEqual(cost.max_hops, n * stride) + self.assertEqual(cost.max_traffic, (n - 1) * volume) + + def test_unicast_distributed_binds_locally(self): + # When the source is physically distributed, data binds as locally as + # possible, reducing hops relative to the non-distributed unicast. + n, stride, volume = 4, 1, 10 + src = _Distributed(fanout=2, stride=4) + cost = self._cost(Relevant("n0"), n=n, stride=stride, volume=volume, src=src) + + # physical_stride / last_fanout = 4, capped at shape_repeats = 4 + n_dsts_per_physical = 4 + n_activated_physical = 1 # n*stride / physical_stride = 4/4 + self.assertEqual( + cost.total_cost, + n_activated_physical * sum(range(n_dsts_per_physical)) * stride * volume, + ) + self.assertEqual(cost.max_hops, (n_dsts_per_physical - 1) * stride) + self.assertEqual(cost.max_traffic, (n_dsts_per_physical - 1) * volume) + + def test_partially_relevant_not_implemented(self): + with self.assertRaises(NotImplementedError): + self._cost(PartiallyRelevant("n0"), n=4, stride=2) + + +class TestAllToAllTopologyModel(TestCase): + """Unit tests for the all-to-all (switch) cost model in isolation.""" + + def _cost(self, relevancy, n, *, volume=10, last_fanout=99): + # last_fanout is deliberately large and arbitrary: an all-to-all switch + # must ignore physical stride entirely. + return AllToAllTopologyModel().per_loop_transfer_cost( + relevancy, + shape_repeats=n, + last_fanout=last_fanout, + volume=volume, + src_component=_NoDistribution(), + dim_name="X", + ) + + def test_registry_resolves_model(self): + # Resolves both by enum and by the StrEnum value (the form that survives + # the arch evaluation pipeline). + self.assertIsInstance( + get_topology_model(TopologySpec.ALL_TO_ALL), AllToAllTopologyModel + ) + self.assertIsInstance(get_topology_model("all_to_all"), AllToAllTopologyModel) + + def test_multicast(self): + n, volume = 5, 10 + cost = self._cost(Irrelevant(), n, volume=volume) + # Linear in destinations, one switch hop, shared link traffic. + self.assertEqual(cost.total_cost, (n - 1) * volume) + self.assertEqual(cost.max_hops, AllToAllTopologyModel.HOPS_PER_TRANSFER) + self.assertEqual(cost.max_traffic, volume) + + def test_unicast(self): + n, volume = 5, 10 + cost = self._cost(Relevant("n0"), n, volume=volume) + # Same (linear) total cost as multicast and constant hops, but the + # source's uplink to the switch carries every distinct message. + self.assertEqual(cost.total_cost, (n - 1) * volume) + self.assertEqual(cost.max_hops, AllToAllTopologyModel.HOPS_PER_TRANSFER) + self.assertEqual(cost.max_traffic, (n - 1) * volume) + + def test_independent_of_stride(self): + # Stride (last_fanout) must not affect any component of the cost. + a = self._cost(Relevant("n0"), 5, last_fanout=1) + b = self._cost(Relevant("n0"), 5, last_fanout=1000) + self.assertEqual( + (a.total_cost, a.max_hops, a.max_traffic), + (b.total_cost, b.max_hops, b.max_traffic), + ) + + def test_linear_unlike_mesh_quadratic(self): + # Against an identical mesh scenario, all-to-all unicast is linear while + # the mesh is quadratic, and all-to-all hops are constant (< distance). + n, volume, stride = 6, 1, 1 + kwargs = dict( + shape_repeats=n, + last_fanout=stride, + volume=volume, + src_component=_NoDistribution(), + dim_name="X", + ) + a2a = AllToAllTopologyModel().per_loop_transfer_cost(Relevant("n0"), **kwargs) + mesh = MeshTopologyModel().per_loop_transfer_cost(Relevant("n0"), **kwargs) + + self.assertEqual(a2a.total_cost, (n - 1) * volume) + self.assertEqual(mesh.total_cost, sum(range(n)) * stride * volume) + self.assertLess(a2a.total_cost, mesh.total_cost) + self.assertLess(a2a.max_hops, mesh.max_hops) + + def test_accumulate_max_hops_persists(self): + # overall_max_hops accumulates across calls for a given network. + model = AllToAllTopologyModel() + h = AllToAllTopologyModel.HOPS_PER_TRANSFER + self.assertEqual(model.accumulate_max_hops("net", h), h) + self.assertEqual(model.accumulate_max_hops("net", h), 2 * h) diff --git a/tests/not_working/networks.ipynb b/tests/not_working/networks.ipynb index 9532c809..1687d547 100644 --- a/tests/not_working/networks.ipynb +++ b/tests/not_working/networks.ipynb @@ -1,102 +1,197 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "43938186", - "metadata": {}, - "outputs": [], - "source": [ - "import accelforge as af" - ] - }, - { - "cell_type": "markdown", - "id": "88205db3", - "metadata": {}, - "source": [ - "Below, we render a completely hierarchical architecture with two networks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49a31e7a", - "metadata": {}, - "outputs": [], - "source": [ - "spec = af.Spec.from_yaml(af.examples.arches.networked.hierarchical)\n", - "spec.arch" - ] - }, - { - "cell_type": "markdown", - "id": "389cb739", - "metadata": {}, - "source": [ - "Now, we render an architecture with certain components in a flat organization, and others in a hierarchy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a11eec1", - "metadata": {}, - "outputs": [], - "source": [ - "spec = af.Spec.from_yaml(af.examples.arches.networked.flat)\n", - "spec.arch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2bbda8a", - "metadata": {}, - "outputs": [], - "source": [ - "spec.calculate_component_costs()._get_flattened_architecture()" - ] - }, - { - "cell_type": "markdown", - "id": "a6a508a5", - "metadata": {}, - "source": [ - "Finally, here is a simplified rack-scale architecture." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc2df4b6", - "metadata": {}, - "outputs": [], - "source": [ - "spec = af.Spec.from_yaml(af.examples.arches.networked.rack)\n", - "spec.arch" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "43938186", + "metadata": {}, + "outputs": [], + "source": [ + "import accelforge as af" + ] + }, + { + "cell_type": "markdown", + "id": "88205db3", + "metadata": {}, + "source": [ + "Below, we render a completely hierarchical architecture with two networks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49a31e7a", + "metadata": {}, + "outputs": [], + "source": [ + "spec = af.Spec.from_yaml(\n", + " af.examples.arches.networked.hierarchical,\n", + " af.examples.workloads.matmuls,\n", + " jinja_parse_data={\"N_EINSUMS\": 1}\n", + ")\n", + "spec.arch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a64424bb", + "metadata": {}, + "outputs": [], + "source": [ + "spec.mapper.metrics = af.mapper.Metrics.LATENCY | af.mapper.Metrics.ENERGY\n", + "result = spec.map_workload_to_arch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cc6ed1d", + "metadata": {}, + "outputs": [], + "source": [ + "result.data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0dadcac", + "metadata": {}, + "outputs": [], + "source": [ + "result.energy(per_component=True)" + ] + }, + { + "cell_type": "markdown", + "id": "389cb739", + "metadata": {}, + "source": [ + "Now, we render an architecture with certain components in a flat organization, and others in a hierarchy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a11eec1", + "metadata": {}, + "outputs": [], + "source": [ + "spec = af.Spec.from_yaml(\n", + " af.examples.arches.networked.flat,\n", + " af.examples.workloads.matmuls,\n", + " jinja_parse_data={\"N_EINSUMS\": 1}\n", + ")\n", + "spec.arch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2bbda8a", + "metadata": {}, + "outputs": [], + "source": [ + "result = spec.map_workload_to_arch()\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e2b3332", + "metadata": {}, + "outputs": [], + "source": [ + "from accelforge.plotting.mappings import plot_energy_breakdown\n", + "\n", + "plot_energy_breakdown([result], separate_by=[\"component\"], stack_by=[\"tensor\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "741719fa", + "metadata": {}, + "outputs": [], + "source": [ + "spec = af.Spec.from_yaml(\n", + " af.examples.arches.networked.flat,\n", + " af.examples.workloads.matmuls,\n", + " jinja_parse_data={\"N_EINSUMS\": 1, \"N_ROW_BUFFER\": 1, \"N_COL_BUFFER\": 1},\n", + ")\n", + "spec.arch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a62e6dfa", + "metadata": {}, + "outputs": [], + "source": [ + "result = spec.map_workload_to_arch()\n", + "plot_energy_breakdown([result], separate_by=[\"component\"], stack_by=[\"tensor\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "929f5399", + "metadata": {}, + "outputs": [], + "source": [ + "result.data[[c for c in result.data.columns if \"hops\" in c]]" + ] + }, + { + "cell_type": "markdown", + "id": "a6a508a5", + "metadata": {}, + "source": [ + "Finally, here is a simplified rack-scale architecture." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc2df4b6", + "metadata": {}, + "outputs": [], + "source": [ + "spec = af.Spec.from_yaml(af.examples.arches.networked.rack)\n", + "spec.arch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e54780a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/tests/test_network.py b/tests/test_network.py deleted file mode 100644 index e8b8d567..00000000 --- a/tests/test_network.py +++ /dev/null @@ -1,249 +0,0 @@ -from pathlib import Path -from unittest import TestCase - -import accelforge as af - -INPUT_FILES_DIR = Path(__file__).parent / "input_files" / "networked" - - -class TestParsing(TestCase): - def test_hierarchical(self): - spec = af.Spec.from_yaml( - # af.examples.arches.networked.hierarchical, - INPUT_FILES_DIR - / "hierarchical.yaml", - ) - self.assertIn("PeArray", spec.arch.nodes) - self.assertEqual(spec.arch.nodes["PeArray"].get_fanout(), 1) - self.assertIn("Scratchpad", spec.arch.nodes) - self.assertEqual(spec.arch.nodes["Scratchpad"].get_fanout(), 4) - self.assertIn("MacArray", spec.arch.nodes) - self.assertEqual(spec.arch.nodes["MacArray"].get_fanout(), 1) - - try: - spec = spec.calculate_component_costs() - except af.EvaluationError as e: - self.fail(e.message) - - def test_flat(self): - spec = af.Spec.from_yaml( - # af.examples.arches.networked.flat, - INPUT_FILES_DIR - / "flat.yaml", - ) - self.assertIn("NoC", spec.arch.nodes) - self.assertEqual(spec.arch.nodes["NoC"].get_fanout(), 1) - self.assertEqual( - {n.name for n in spec.arch.get_nodes_of_type(af.spec.Leaf)}, - { - "MainMemory", - "GlobalBuffer", - "NoC", - "RowBuffer", - "ColumnBuffer", - "DistributedBuffer", - "Scratchpad", - "MAC", - }, - ) - - try: - spec = spec.calculate_component_costs() - except af.EvaluationError as e: - self.fail(e.message) - - -class TestModel(TestCase): - def test_hierarchical_1d(self): - M = 8 - KN = 8 - MAC_TILE = 2 - M_TILE = 4 - BITS_PER_VALUE = 8 - - spec = af.Spec.from_yaml( - af.examples.workloads.matmuls, - # af.examples.arches.networked.hierarchical, - INPUT_FILES_DIR / "hierarchical_1d.yaml", - # af.examples.mappings.one_matmul_to_networked_hierarchical, - INPUT_FILES_DIR / "one_matmul_to_networked_hierarchical_1d.yaml", - jinja_parse_data={ - "N_EINSUMS": 1, - "M": 8, - "KN": 8, - "MAC_TILE": MAC_TILE, - "M_TILE": M_TILE, - }, - ) - result = spec.evaluate_mapping() - self.assertEqual( - result.data["Matmul0actionMacArrayT0hops"].iloc[0], - (M / M_TILE) - * (KN / MAC_TILE) # number of used Scratchpad - * M_TILE - * KN # temporal for n1 in mapping - * sum(i + 1 for i in range(MAC_TILE)) # unicast along X-axis of MacArray - * BITS_PER_VALUE, - ) - # NOTE: assuming XY routing (as defined in mapping) - self.assertEqual( - result.data["Matmul0actionMacArrayT1hops"].iloc[0], - (M / M_TILE) - * (KN / MAC_TILE) - * M_TILE - * KN # temporal for n1 in mapping - * MAC_TILE # multicast along X-axis of MacArray - * BITS_PER_VALUE, - ) - self.assertEqual( - result.data["Matmul0actionMacArrayW0hops"].iloc[0], - (M / M_TILE) - * (KN / MAC_TILE) - * M_TILE - * KN - * sum(i + 1 for i in range(MAC_TILE)) - * BITS_PER_VALUE, - ) - - self.assertEqual( - result.data["Matmul0actionPeArrayT0hops"].iloc[0], - (M / M_TILE) - * sum( - i + 1 for i in range(KN // MAC_TILE) - ) # unicast along X-axis of PeArray - * M_TILE - * MAC_TILE - * BITS_PER_VALUE, - ) - # NOTE: assuming XY routing (as defined in mapping) - self.assertEqual( - result.data["Matmul0actionPeArrayT1hops"].iloc[0], - (M / M_TILE) - * KN - // MAC_TILE # multicast along X-axis of PeArray - * M_TILE - * KN - * BITS_PER_VALUE, - ) - self.assertEqual( - result.data["Matmul0actionPeArrayW0hops"].iloc[0], - (M / M_TILE) - * sum(i + 1 for i in range(KN // MAC_TILE)) # unicast along PeArray - * MAC_TILE - * KN - * BITS_PER_VALUE, - ) - - def test_hierarchical(self): - M = 8 - KN = 8 - MAC_TILE = 2 - PE_TILE = KN // MAC_TILE - M_TILE = 4 - BITS_PER_VALUE = 8 - - spec = af.Spec.from_yaml( - af.examples.workloads.matmuls, - # af.examples.arches.networked.hierarchical, - INPUT_FILES_DIR / "hierarchical.yaml", - # af.examples.mappings.one_matmul_to_networked_hierarchical, - INPUT_FILES_DIR / "one_matmul_to_networked_hierarchical.yaml", - jinja_parse_data={ - "N_EINSUMS": 1, - "M": 8, - "KN": 8, - "MAC_TILE": MAC_TILE, - "M_TILE": M_TILE, - }, - ) - result = spec.evaluate_mapping() - self.assertEqual( - result.data["Matmul0actionMacArrayT0hops"].iloc[0], - (M / M_TILE) - * (KN / MAC_TILE) ** 2 - * M_TILE - * ( - sum(i + 1 for i in range(MAC_TILE)) # unicasting along X - + MAC_TILE * MAC_TILE # multicast along Y for each column - ) - * BITS_PER_VALUE, - ) - # NOTE: assuming XY routing (as defined in mapping) - self.assertEqual( - result.data["Matmul0actionMacArrayT1hops"].iloc[0], - (M / M_TILE) - * (KN / MAC_TILE) ** 2 - * M_TILE - * ( - MAC_TILE - * MAC_TILE # multicast along X (the tile is shape N1, which is MAC_TILE here) - + MAC_TILE - * sum(i + 1 for i in range(MAC_TILE)) # unicasting along Y for each row - ) - * BITS_PER_VALUE, - ) - self.assertEqual( - result.data["Matmul0actionMacArrayW0hops"].iloc[0], - (M / M_TILE) - * (KN / MAC_TILE) ** 2 - * M_TILE - * ( - MAC_TILE - * sum( - i + 1 for i in range(MAC_TILE) - ) # unicast along X (the tile is shape N1, which is MAC_TILE here) - + MAC_TILE - * sum(i + 1 for i in range(MAC_TILE)) # unicasting along Y for each row - ) - * BITS_PER_VALUE, - ) - - self.assertEqual( - result.data["Matmul0actionPeArrayT0hops"].iloc[0], - (M / M_TILE) * (sum(i + 1 for i in range(PE_TILE)) + PE_TILE * PE_TILE) - # tile shape - * M_TILE * MAC_TILE * BITS_PER_VALUE, - ) - # NOTE: assuming XY routing (as defined in mapping) - self.assertEqual( - result.data["Matmul0actionPeArrayT1hops"].iloc[0], - (M / M_TILE) - * (PE_TILE * PE_TILE + PE_TILE * sum(i + 1 for i in range(PE_TILE))) - * M_TILE - * MAC_TILE - * BITS_PER_VALUE, - ) - self.assertEqual( - result.data["Matmul0actionPeArrayW0hops"].iloc[0], - (M / M_TILE) - * ( - PE_TILE * sum(i + 1 for i in range(PE_TILE)) - + PE_TILE * sum(i + 1 for i in range(PE_TILE)) - ) - * MAC_TILE**2 - * BITS_PER_VALUE, - ) - - -class TestMapper(TestCase): - def test_hierarchical(self): - M = 8 - KN = 8 - MAC_TILE = 2 - PE_TILE = KN // MAC_TILE - M_TILE = 4 - BITS_PER_VALUE = 8 - - spec = af.Spec.from_yaml( - af.examples.workloads.matmuls, - # af.examples.arches.networked.hierarchical, - INPUT_FILES_DIR / "hierarchical.yaml", - jinja_parse_data={ - "N_EINSUMS": 1, - "M": 8, - "KN": 8, - "MAC_TILE": MAC_TILE, - "M_TILE": M_TILE, - }, - ) - result = spec.map_workload_to_arch()